Compare commits


1 commit

Commit b4b6407716 (2023-02-26 00:26:39 +03:00)
WIP Implement RDMA v2 based on IBV_WR_RDMA_WRITE with remote buffer management
One BIG FIXME remaining - handling large operations :))
75 changed files with 1073 additions and 1204 deletions
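For context on the commit message above: IBV_WR_RDMA_WRITE is the libibverbs work-request opcode that writes a local buffer directly into a registered memory region on the peer, which is why the receiving side has to hand out and reclaim ranges of its buffer ("remote buffer management"). The sketch below only illustrates that primitive under assumed names (rdma_write_to_remote, qp, mr, remote_addr, rkey); it is not Vitastor's actual messenger code.

```cpp
// Illustrative sketch only: post an RDMA write that lands directly in the
// peer's registered memory. The (remote_addr, rkey) pair is assumed to have
// been granted by the peer from its own buffer manager.
#include <infiniband/verbs.h>
#include <cstdint>
#include <cstring>

int rdma_write_to_remote(ibv_qp *qp, void *buf, ibv_mr *mr, uint32_t len,
                         uint64_t remote_addr, uint32_t rkey, uint64_t wr_id)
{
    ibv_sge sge;
    memset(&sge, 0, sizeof(sge));
    sge.addr = (uintptr_t)buf;             // local buffer inside a registered MR
    sge.length = len;
    sge.lkey = mr->lkey;

    ibv_send_wr wr, *bad_wr = NULL;
    memset(&wr, 0, sizeof(wr));
    wr.wr_id = wr_id;                      // echoed back in the completion entry
    wr.sg_list = &sge;
    wr.num_sge = 1;
    wr.opcode = IBV_WR_RDMA_WRITE;         // write into remote memory, no matching recv needed
    wr.send_flags = IBV_SEND_SIGNALED;     // request a CQ entry on completion
    wr.wr.rdma.remote_addr = remote_addr;  // range granted by the peer
    wr.wr.rdma.rkey = rkey;                // rkey of the peer's registered region

    return ibv_post_send(qp, &wr, &bad_wr); // 0 on success, errno-style code on failure
}
```

Unlike plain sends, the target side never has to pre-post matching receives for such writes; it only has to track which ranges of its registered buffer are free, which appears to be what the new freelist allocator added later in this diff is for.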


@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.7")
set(VERSION "0.8.5")
add_subdirectory(src)


@@ -1,4 +1,4 @@
VERSION ?= v0.8.7
VERSION ?= v0.8.5
all: build push


@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.7
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.7
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.7"
vitastorCSIDriverVersion = "0.8.5"
)
// Config struct fills the parameters of request or user input


@@ -6,11 +6,11 @@ package vitastor
import (
"context"
"encoding/json"
"fmt"
"strings"
"bytes"
"strconv"
"time"
"fmt"
"os"
"os/exec"
"io/ioutil"
@@ -21,6 +21,8 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"go.etcd.io/etcd/clientv3"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -112,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
return ctxVars, etcdUrl, etcdPrefix
}
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{
if (ctxVars["etcdUrl"] != "")
{
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
}
if (ctxVars["etcdPrefix"] != "")
{
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stdout, stderr bytes.Buffer
c.Stdout = &stdout
c.Stderr = &stderr
err := c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), nil
}
// Create the volume
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
{
@@ -172,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
}
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
// FIXME: The following should PROBABLY be implemented externally in a management tool
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
// Connect to etcd
cli, err := clientv3.New(clientv3.Config{
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
}
defer cli.Close()
var imageId uint64 = 0
for
{
// Check if the image exists
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) > 0)
{
kv := resp.Kvs[0]
var v InodeIndex
err := json.Unmarshal(kv.Value, &v)
if (err != nil)
{
return nil, err
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
poolId = v.PoolId
imageId = v.Id
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(inodeCfg) == 0)
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
}
if (inodeCfg[0].Size < uint64(volSize))
var inodeCfg InodeConfig
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
if (inodeCfg.Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
}
}
else
{
return nil, err
// Find a free ID
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, maxIdKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
var modRev int64
var nextId uint64
if (len(resp.Kvs) > 0)
{
var err error
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
if (err != nil)
{
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
}
modRev = resp.Kvs[0].ModRevision
nextId++
}
else
{
nextId = 1
}
inodeIdxJson, _ := json.Marshal(InodeIndex{
Id: nextId,
PoolId: poolId,
})
inodeCfgJson, _ := json.Marshal(InodeConfig{
Name: volName,
Size: uint64(volSize),
})
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
).Then(
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
}
if (txnResp.Succeeded)
{
imageId = nextId
break
}
// Start over if the transaction fails
}
}
@@ -238,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
}
volName := ctxVars["name"]
ctxVars, _, _ = GetConnectionParams(ctxVars)
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
cli, err := clientv3.New(clientv3.Config{
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil)
{
return nil, err
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
}
defer cli.Close()
// Find inode by name
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var idx InodeIndex
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
}
// Get inode config
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err = cli.Get(ctx, inodeCfgKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var inodeCfg InodeConfig
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
// Delete inode data by invoking vitastor-cli
args := []string{
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
"--pool", fmt.Sprintf("%d", idx.PoolId),
"--inode", fmt.Sprintf("%d", idx.Id),
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stderr bytes.Buffer
c.Stdout = nil
c.Stderr = &stderr
err = c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
// Delete inode config in etcd
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).Then(
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
}
if (!txnResp.Succeeded)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
}
return &csi.DeleteVolumeResponse{}, nil

debian/changelog

@@ -1,10 +1,10 @@
vitastor (0.8.7-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.7-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation


@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.7; \
cd vitastor-0.8.7; \
cp -r /root/vitastor vitastor-0.8.5; \
cd vitastor-0.8.5; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.7.orig.tar.xz vitastor-0.8.7; \
cd vitastor-0.8.7; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
cd vitastor-0.8.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \


@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
- [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
## rdma_port_num
- Type: integer
@@ -123,30 +116,20 @@ required to change this parameter.
## rdma_max_msg
- Type: integer
- Default: 132096
- Default: 1048576
Maximum size of a single RDMA send or receive operation in bytes.
## rdma_max_recv
- Type: integer
- Default: 16
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
## rdma_max_send
- Type: integer
- Default: 8
Maximum number of outstanding RDMA send operations per connection. Should be
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
Maximum number of parallel RDMA receive operations. Note that this number
of receive buffers `rdma_max_msg` in size are allocated for each client,
so this setting actually affects memory usage. This is because RDMA receive
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
later versions.
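As a quick sanity check of the "roughly 2 MB * number of OSDs" figure above, using the defaults listed in this file (rdma_max_recv = 16 receive buffers of rdma_max_msg = 132096 bytes each):

```latex
16 \times 132096\ \text{bytes} = 2113536\ \text{bytes} \approx 2\ \text{MiB per connected OSD}
```

so a client connected to N OSDs reserves roughly 2 MiB × N for RDMA receive buffers alone.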
## peer_connect_interval


@@ -19,7 +19,6 @@
- [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
## rdma_port_num
- Тип: целое число
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
## rdma_max_msg
- Тип: целое число
- Значение по умолчанию: 132096
- Значение по умолчанию: 1048576
Максимальный размер одной RDMA-операции отправки или приёма.
## rdma_max_recv
- Тип: целое число
- Значение по умолчанию: 16
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
## rdma_max_send
- Тип: целое число
- Значение по умолчанию: 8
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
Максимальное число параллельных RDMA-операций получения данных. Следует
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
для каждого подключённого клиентского соединения, так что данная настройка
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
## peer_connect_interval


@@ -53,12 +53,6 @@
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
@@ -67,13 +61,6 @@
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
- name: rdma_port_num
type: int
default: 1
@@ -127,39 +114,26 @@
так что менять этот параметр обычно не нужно.
- name: rdma_max_msg
type: int
default: 132096
default: 1048576
info: Maximum size of a single RDMA send or receive operation in bytes.
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
- name: rdma_max_recv
type: int
default: 16
info: |
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
info_ru: |
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
- name: rdma_max_send
type: int
default: 8
info: |
Maximum number of outstanding RDMA send operations per connection. Should be
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
Maximum number of parallel RDMA receive operations. Note that this number
of receive buffers `rdma_max_msg` in size are allocated for each client,
so this setting actually affects memory usage. This is because RDMA receive
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
later versions.
info_ru: |
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
Максимальное число параллельных RDMA-операций получения данных. Следует
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
для каждого подключённого клиентского соединения, так что данная настройка
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
- name: peer_connect_interval
type: sec
min: 1


@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
Implemented NVDIMM support can basically eliminate WA at all - all extra writes will
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
if you want to provide me with such cluster for tests.
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
written when they fill up or fsync is requested.
## In Practice
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
good server-grade SSD/NVMe drives and disabled CPU power saving, you should head for:
In practice, using tests from [Understanding Performance](understanding.en.md)
and good server-grade SSD/NVMe drives, you should head for:
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
Current latency records:
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
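A quick check of the 2.375 write-amplification figure above, assuming the baseline WA of 5 corresponds to five 4 KB writes per 4 KB client write: lowering writes 1, 3 and 4 to 512 bytes (1/8 of the data size) while the remaining two stay at 4 KB gives

```latex
\mathrm{WA} = \frac{3 \times 512 + 2 \times 4096}{4096} = \frac{9728}{4096} = 2.375
```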


@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
образом запрашивается fsync.
## На практике
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
Зафиксированный на данный момент рекорд задержки:
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации


@@ -1,4 +1,4 @@
[Documentation](../../README.md#documentation) → Usage → Disk management tool
[Documentation](../../README.md#documentation) → Usage → Disk Tool
-----


@@ -1,4 +1,4 @@
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
-----


@@ -51,9 +51,8 @@ const etcd_tree = {
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
// etcd connection
config_path: "/etc/vitastor/vitastor.conf",
etcd_prefix: "/vitastor",
// etcd connection - configurable online
etcd_address: "10.0.115.10:2379/v3",
etcd_prefix: "/vitastor",
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
@@ -71,15 +70,14 @@ const etcd_tree = {
rdma_gid_index: 0,
rdma_mtu: 4096,
rdma_max_sge: 128,
rdma_max_send: 8,
rdma_max_recv: 16,
rdma_max_msg: 132096,
rdma_max_send: 32,
rdma_max_recv: 8,
rdma_max_msg: 1048576,
log_level: 0,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client and osd - configurable online
log_level: 0,
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
@@ -97,19 +95,18 @@ const etcd_tree = {
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
bind_port: 0,
readonly: false,
osd_memlock: false,
// osd - configurable online
autosync_interval: 5,
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
recovery_sync_batch: 16,
readonly: false,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
osd_memlock: false,
// blockstore - fixed in superblock
block_size,
disk_alignment,
@@ -128,15 +125,14 @@ const etcd_tree = {
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable offline
// blockstore - configurable
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
// blockstore - configurable online
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
throttle_small_writes: false,
throttle_target_iops: 100,
throttle_target_mbs: 100,


@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.7'
VERSION = '0.8.5'
LOG = logging.getLogger(__name__)


@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.7$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *


@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.7.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \


@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.7
Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.7.el7.tar.gz
Source0: vitastor-0.8.5.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel


@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.7.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \


@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.7
Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.7.el8.tar.gz
Source0: vitastor-0.8.5.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel


@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8)
project(vitastor)
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.7")
add_definitions(-DVERSION="0.8.5")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -98,7 +98,7 @@ endif (${WITH_FIO})
# libvitastor_common.a
set(MSGR_RDMA "")
if (IBVERBS_LIBRARIES)
set(MSGR_RDMA "msgr_rdma.cpp")
set(MSGR_RDMA msgr_rdma.cpp freelist.cpp allocator.cpp)
endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC
epoll_manager.cpp etcd_state_client.cpp messenger.cpp addr_util.cpp
@@ -278,6 +278,11 @@ add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
add_dependencies(build_tests test_allocator)
add_test(NAME test_allocator COMMAND test_allocator)
# test_freelist
add_executable(test_freelist EXCLUDE_FROM_ALL test_freelist.cpp)
add_dependencies(build_tests test_freelist)
add_test(NAME test_freelist COMMAND test_freelist)
# test_cas
add_executable(test_cas
test_cas.cpp


@@ -13,11 +13,6 @@ blockstore_t::~blockstore_t()
delete impl;
}
void blockstore_t::parse_config(blockstore_config_t & config)
{
impl->parse_config(config, false);
}
void blockstore_t::loop()
{
impl->loop();


@@ -107,7 +107,7 @@ Input:
- buf = pre-allocated obj_ver_id array <len> units long
Output:
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
## BS_OP_SYNC_STAB_ALL
@@ -165,9 +165,6 @@ public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
void parse_config(blockstore_config_t & config);
// Event loop
void loop();


@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
parse_config(config, true);
parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
try
{
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
// Can't submit SYNC before previous writes
continue;
}
wr_st = continue_sync(op);
wr_st = continue_sync(op, false);
if (wr_st != 2)
{
has_writes = wr_st > 0 ? 1 : 2;
@@ -371,18 +371,13 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
init_op(op);
submit_queue.push_back(op);
ringloop->wakeup();
}
void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
submit_queue.push_back(op);
ringloop->wakeup();
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)


@@ -216,11 +216,6 @@ struct pool_shard_settings_t
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
blockstore_disk_t dsk;
@@ -282,6 +277,7 @@ class blockstore_impl_t
friend class journal_flusher_t;
friend class journal_flusher_co;
void parse_config(blockstore_config_t & config);
void calc_lengths();
void open_data();
void open_meta();
@@ -303,7 +299,6 @@ class blockstore_impl_t
blockstore_init_journal* journal_init_reader;
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *read_op);
@@ -323,7 +318,7 @@ class blockstore_impl_t
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op);
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
void ack_sync(blockstore_op_t *op);
// Stabilize
@@ -331,8 +326,6 @@ class blockstore_impl_t
int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
@@ -348,8 +341,6 @@ public:
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
// Event loop
void loop();


@@ -4,54 +4,8 @@
#include <sys/file.h>
#include "blockstore_impl.h"
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
// Online-configurable options:
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
}
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
if (!init)
{
return;
}
// Offline-configurable options:
// Common disk options
dsk.parse_config(config);
// Parse
@@ -90,7 +44,29 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false";
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (journal.sector_count < 2)
{
journal.sector_count = 32;
@@ -115,6 +91,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
// init some fields
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;


@@ -9,39 +9,48 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
return continue_rollback(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
obj_ver_id *v, *nv;
int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
{
if (nv != v)
{
*nv = *v;
}
// Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.oid = v->oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
skip_ov:
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
todo--;
nv--;
continue;
}
else
{
dirty_it--;
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
goto skip_ov;
}
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
return 0;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
// Sync the object
return STAB_SPLIT_SYNC;
op->retval = -EBUSY;
FINISH_OP(op);
return 2;
}
if (dirty_it == dirty_db.begin())
{
@@ -49,16 +58,19 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
});
if (r != 1)
}
op->len = todo;
if (!todo)
{
return r;
// Already rolled back
op->retval = 0;
FINISH_OP(op);
return 2;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
{
return 0;
}
@@ -66,8 +78,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)


@@ -41,309 +41,60 @@
// 4) after a while it takes his synced object list and sends stabilize requests
// to peers and to its own blockstore, thus freeing the old version
struct ver_vector_t
{
obj_ver_id *items = NULL;
uint64_t alloc = 0, size = 0;
};
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
if (!vec.items)
{
vec.alloc = len;
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
for (auto sv = start; sv < end; sv++)
{
vec.items[vec.size++] = *sv;
}
}
}
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
if (vec.size >= vec.alloc)
{
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
}
vec.items[vec.size++] = ov;
}
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
{
bool found = false;
int j = 0, k = 0;
while (j < check.size())
{
if (check[j] == ov)
found = true;
if (check[j].oid == ov.oid && check[j].version <= ov.version)
{
to.push_back(check[j++]);
if (count)
(*count)--;
}
else
check[k++] = check[j++];
}
check.resize(k);
return found;
}
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
// Create a sync operation, insert into the end of the queue
// And move ourselves into the end too!
// Rather hacky but that's what we need...
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [this](blockstore_op_t *sync_op)
{
delete sync_op;
};
init_op(sync_op);
int sync_res = continue_sync(sync_op);
if (sync_res != 2)
{
// Put SYNC into the queue if it's not finished yet
submit_queue.push_back(sync_op);
}
// Restore unsynced_writes
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
if (sync_res == 2)
{
// Sync is immediately completed
return NULL;
}
return sync_op;
}
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
bool add_sync = false;
ver_vector_t good_vers, bad_vers;
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
int action = decider(*v);
if (action < 0)
{
// Rollback changes
for (auto & ov: PRIV(op)->sync_big_writes)
{
unsynced_big_writes.push_back(ov);
unsynced_big_write_count++;
}
for (auto & ov: PRIV(op)->sync_small_writes)
{
unsynced_small_writes.push_back(ov);
}
free(good_vers.items);
good_vers.items = NULL;
free(bad_vers.items);
bad_vers.items = NULL;
// Error
op->retval = action;
FINISH_OP(op);
return 2;
}
else if (action == STAB_SPLIT_DONE)
{
// Already done
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
}
else if (action == STAB_SPLIT_WAIT)
{
// Already in progress, we just have to wait until it finishes
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
}
else if (action == STAB_SPLIT_SYNC)
{
// Needs a SYNC, we have to send a SYNC if not already in progress
//
// If the object is not present in unsynced_(big|small)_writes then
// it's currently being synced. If it's present then we can initiate
// its sync ourselves.
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
if (!add_sync)
{
PRIV(op)->sync_big_writes.clear();
PRIV(op)->sync_small_writes.clear();
add_sync = true;
}
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
}
else /* if (action == STAB_SPLIT_TODO) */
{
if (good_vers.items)
{
// If we're selecting versions then append it
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
// And we don't want to select/allocate anything in that optimistic case
append_version(good_vers, *v);
}
todo++;
}
}
// In a pessimistic scenario, an operation may be split into 3:
// - Stabilize synced entries
// - Sync unsynced entries
// - Continue for unsynced entries after sync
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
if (!todo && !bad_vers.size)
{
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
op->retval = 0;
if (!todo && !add_sync)
{
// Only wait for inflight writes or current in-progress syncs
return 0;
}
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
if (add_sync)
{
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
sync_op = selective_sync(op);
}
if (bad_vers.size)
{
// Split part of the request into a separate operation
split_stab_op = new blockstore_op_t;
split_stab_op->opcode = op->opcode;
split_stab_op->buf = bad_vers.items;
split_stab_op->len = bad_vers.size;
init_op(split_stab_op);
submit_queue.push_back(split_stab_op);
}
if (sync_op || split_stab_op || good_vers.items)
{
void *orig_buf = op->buf;
if (good_vers.items)
{
op->buf = good_vers.items;
op->len = good_vers.size;
}
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [this, op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{
if (split_op->retval != 0)
op->retval = split_op->retval;
(*split_op_counter)--;
assert((*split_op_counter) >= 0);
if (op != split_op)
delete split_op;
if (!*split_op_counter)
{
free(good_items);
free(bad_items);
free(split_op_counter);
op->buf = orig_buf;
real_cb(op);
}
};
if (sync_op)
{
sync_op->callback = cb;
}
if (split_stab_op)
{
split_stab_op->callback = cb;
}
op->callback = cb;
}
if (!todo)
{
// All work is postponed
op->callback = NULL;
return 2;
}
return 1;
}
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
auto dirty_it = dirty_db.find(ov);
auto dirty_it = dirty_db.find(*v);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(ov.oid);
auto clean_it = clean_db.find(ov.oid);
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
auto & clean_db = clean_db_shard(v->oid);
auto clean_it = clean_db.find(v->oid);
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
{
// No such object version
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
return -ENOENT;
op->retval = -ENOENT;
FINISH_OP(op);
return 2;
}
else
{
// Already stable
return STAB_SPLIT_DONE;
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
return 0;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet - sync it
// In previous versions we returned EBUSY here and required
// the caller (OSD) to issue a global sync first. But a global sync
// waits for all writes in the queue including inflight writes. And
// inflight writes may themselves be blocked by unstable writes being
// still present in the journal and not flushed away from it.
// So we must sync specific objects here.
//
// Even more, we have to process "stabilize" request in parts. That is,
// we must stabilize all objects which are already synced. Otherwise
// they may block objects which are NOT synced yet.
return STAB_SPLIT_SYNC;
// Object not synced yet. Caller must sync it first
op->retval = -EBUSY;
FINISH_OP(op);
return 2;
}
else if (IS_STABLE(dirty_it->second.state))
else if (!IS_STABLE(dirty_it->second.state))
{
// Already stable
return STAB_SPLIT_DONE;
todo++;
}
else
{
return STAB_SPLIT_TODO;
}
});
if (r != 1)
}
if (!todo)
{
return r;
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
{
return 0;
}
@@ -351,9 +102,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME: Only stabilize versions that aren't stable yet
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty)
{


@@ -12,7 +12,7 @@
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
{
if (immediate_commit == IMMEDIATE_ALL)
{
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
PRIV(op)->op_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE)
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
{
ack_sync(op);
return 2;


@@ -278,7 +278,7 @@ struct rm_osd_t
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
{
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
if (!retry_wait)
retry_wait = 1000;
retry_wait += etcd_tx_retry_ms;


@@ -198,9 +198,9 @@ resume_2:
}
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
}
bool readonly = json_is_true(parent->cli->config["readonly"]);
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
if (parent->json_output)
{
// JSON output


@@ -18,12 +18,11 @@
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
cli_config = config.object_items();
file_config = osd_messenger_t::read_config(config);
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
config = osd_messenger_t::read_config(config);
this->ringloop = ringloop;
this->tfd = tfd;
this->config = config;
msgr.osd_num = 0;
msgr.tfd = tfd;
@@ -59,7 +58,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
msgr.stop_client(op->peer_fd);
delete op;
};
msgr.parse_config(config);
msgr.parse_config(this->config);
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -277,10 +276,13 @@ restart:
continuing_ops = 0;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
this->etcd_global_config = etcd_global_config;
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
this->merged_config = config;
for (auto & kv: this->config.object_items())
{
this->merged_config[kv.first] = kv.second;
}
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -290,13 +292,14 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
// Old name
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
}
else
client_max_dirty_bytes = 0;
if (config.find("client_max_dirty_ops") != config.end())
{
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
}
if (!client_max_dirty_bytes)
{
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
}
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
if (!client_max_dirty_ops)
{
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
@@ -311,7 +314,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
up_wait_retry_interval = 50;
}
msgr.parse_config(config);
st_cli.parse_config(config);
msgr.parse_config(this->config);
st_cli.load_pgs();
}
@@ -1118,24 +1121,6 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
if (part->op.reply.hdr.retval != expected)
{
// Operation failed, retry
part->flags |= PART_ERROR;
if (!op->retval || op->retval == -EPIPE)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
int stop_fd = -1;
if (op->retval != -EINTR && op->retval != -EIO)
{
stop_fd = part->op.peer_fd;
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
if (part->op.reply.hdr.retval == -EPIPE)
{
// Mark op->up_wait = true before stopping the client
@@ -1149,17 +1134,20 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
});
}
}
if (op->inflight_count == 0)
if (!op->retval || op->retval == -EPIPE)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
if (stop_fd >= 0)
if (op->retval != -EINTR && op->retval != -EIO)
{
msgr.stop_client(stop_fd);
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
msgr.stop_client(part->op.peer_fd);
}
part->flags |= PART_ERROR;
}
else
{
@@ -1173,13 +1161,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
copy_part_bitmap(op, part);
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
}
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
}
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
}


@@ -112,8 +112,8 @@ public:
osd_messenger_t msgr;
void init_msgr();
json11::Json::object cli_config, file_config, etcd_global_config;
json11::Json::object config;
json11::Json config;
json11::Json::object merged_config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();


@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (je->big_write.size > sizeof(journal_entry_big_write))
{
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
{
printf("%02x", ((uint8_t*)je)[i]);
}


@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
buf_size = dsk.meta_len;
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
read_blocking(dsk.meta_fd, data, buf_size);
// Check superblock
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
if (hdr->zero == 0 &&
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
if (buf_size % dsk.meta_block_size)
{
buf_size = 8*dsk.meta_block_size;
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
memcpy(new_data, data, dsk.meta_block_size);
free(data);
data = new_data;
hdr = (blockstore_meta_header_v1_t *)data;
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
}
}
dsk.bitmap_granularity = hdr->bitmap_granularity;


@@ -18,8 +18,12 @@ etcd_state_client_t::~etcd_state_client_t()
}
watches.clear();
etcd_watches_initialised = -1;
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
#ifndef __MOCK__
stop_ws_keepalive();
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
@@ -241,7 +245,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
if (this->etcd_keepalive_timeout < 30)
this->etcd_keepalive_timeout = 30;
}
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
@@ -262,13 +265,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
{
this->etcd_quick_timeout = 1000;
}
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
{
#ifndef __MOCK__
stop_ws_keepalive();
start_ws_keepalive();
#endif
}
}
void etcd_state_client_t::pick_next_etcd()
@@ -482,20 +478,6 @@ void etcd_state_client_t::start_etcd_watcher()
{
on_start_watcher_hook(etcd_watch_ws);
}
start_ws_keepalive();
}
void etcd_state_client_t::stop_ws_keepalive()
{
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
}
void etcd_state_client_t::start_ws_keepalive()
{
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)

View File

@@ -132,8 +132,6 @@ public:
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher();
void stop_ws_keepalive();
void start_ws_keepalive();
void load_global_config();
void load_pgs();
void parse_state(const etcd_kv_t & kv);

63
src/freelist.cpp Normal file
View File

@@ -0,0 +1,63 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <assert.h>
#include "freelist.h"
uint64_t freelist_allocator_t::alloc(uint64_t data_size)
{
for (int i = 0; i < freelist.size(); i++)
{
if (freelist[i].size >= data_size)
{
uint64_t r = freelist[i].start;
freelist[i].start += data_size;
freelist[i].size -= data_size;
return r;
}
}
return UINT64_MAX;
}
void freelist_allocator_t::free(uint64_t start, uint64_t size)
{
int min = 0, max = freelist.size();
if (max && freelist[freelist.size()-1].start < start)
{
min = max;
}
if (max && freelist[0].start >= start)
{
max = 0;
}
while (max-min > 1)
{
int mid = (min+max)/2;
if (freelist[mid].start >= start)
max = mid;
else
min = mid;
}
// max = the first item where freelist[max].start >= start
if (max > 0 && freelist[max-1].start+freelist[max-1].size >= start)
{
assert(freelist[max-1].start+freelist[max-1].size == start);
freelist[max-1].size += size;
}
else if (max < freelist.size() && freelist[max].start <= size+start)
{
assert(freelist[max].start == size+start);
freelist[max].start -= size;
freelist[max].size += size;
}
else
{
freelist.insert(freelist.begin()+max, (freelist_item_t){ .start = start, .size = size });
max = min; // to skip the if below
}
if (min != max && max < freelist.size() && freelist[max].start == freelist[min].start+freelist[min].size)
{
freelist[min].size += freelist[max].size;
freelist.erase(freelist.begin()+max, freelist.begin()+max+1);
}
}
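
A minimal usage sketch of the allocator above (assuming it is compiled together with the new freelist.h/freelist.cpp): alloc() carves space from the first free range that fits, and free() merges the returned range with adjacent free neighbours, so releasing pieces in any order eventually restores one contiguous region. The new src/test_freelist.cpp further below exercises the same behaviour in more detail.

// Sketch only: standalone usage of freelist_allocator_t, mirroring src/test_freelist.cpp below.
#include <assert.h>
#include "freelist.h"
int main()
{
    freelist_allocator_t alloc;
    alloc.free(0, 0x10000);            // seed the allocator with one 64 KiB region
    uint64_t a = alloc.alloc(0x1000);  // carve 4 KiB from its start
    uint64_t b = alloc.alloc(0x2000);  // and 8 KiB right after it
    assert(a == 0 && b == 0x1000);
    alloc.free(a, 0x1000);             // becomes a separate free range
    alloc.free(b, 0x2000);             // merges with both neighbours
    assert(alloc.alloc(0x10000) == 0); // the full 64 KiB region is contiguous again
    return 0;
}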

23
src/freelist.h Normal file
View File

@@ -0,0 +1,23 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <stdint.h>
#include <vector>
struct freelist_item_t
{
uint64_t start, size;
};
// Really trivial freelist allocator
// Should be fine for remote RDMA memory management because
// most of the time fragmentation shouldn't be an issue as all
// memory regions are short-lived
struct freelist_allocator_t
{
std::vector<freelist_item_t> freelist;
uint64_t alloc(uint64_t data_size);
void free(uint64_t start, uint64_t size);
};

View File

@@ -157,13 +157,16 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_sge = 128;
this->rdma_max_send = config["rdma_max_send"].uint64_value();
if (!this->rdma_max_send)
this->rdma_max_send = 8;
this->rdma_max_send = 128;
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
if (!this->rdma_max_recv)
this->rdma_max_recv = 16;
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_max_recv = 128;
this->rdma_op_slots = config["rdma_op_slots"].uint64_value();
if (!this->rdma_op_slots || this->rdma_op_slots >= 1024*1024)
this->rdma_op_slots = 4096;
this->rdma_op_memory = config["rdma_op_memory"].uint64_value();
if (!this->rdma_op_memory || this->rdma_op_memory >= 1024*1024*1024)
this->rdma_op_memory = 16*1024*1024;
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -388,12 +391,16 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
#ifdef WITH_RDMA
if (rdma_context)
{
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (cl->rdma_conn)
{
clients_by_qp[cl->rdma_conn->qp->qp_num] = cl->peer_fd;
json11::Json payload = json11::Json::object {
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
{ "rdma_max_msg", cl->rdma_conn->max_msg },
{ "rdma_data_rkey", (uint64_t)cl->rdma_conn->in_data_mr->rkey },
{ "rdma_op_rkey", (uint64_t)cl->rdma_conn->in_op_mr->rkey },
{ "rdma_op_slots", cl->rdma_conn->op_slots },
{ "rdma_op_memory", cl->rdma_conn->op_memory },
};
std::string payload_str = payload.dump();
op->req.show_conf.json_len = payload_str.size();
@@ -453,12 +460,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
config["rdma_op_memory"].uint64_value() == 0 ||
cl->rdma_conn->connect(&addr) != 0)
{
fprintf(
stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
cl->osd_num, config["rdma_address"].string_value().c_str()
);
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
cl->rdma_conn = NULL;
// FIXME: Keep TCP connection in this case
@@ -470,11 +479,12 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
}
else
{
uint64_t server_max_msg = config["rdma_max_msg"].uint64_value();
if (cl->rdma_conn->max_msg > server_max_msg)
{
cl->rdma_conn->max_msg = server_max_msg;
}
cl->rdma_conn->set_out_capacity(
config["rdma_data_rkey"].uint64_value(),
config["rdma_op_rkey"].uint64_value(),
config["rdma_op_slots"].uint64_value(),
config["rdma_op_memory"].uint64_value()
);
if (log_level > 0)
{
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
@@ -534,9 +544,8 @@ bool osd_messenger_t::is_rdma_enabled()
}
#endif
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
json11::Json::object file_config;
const char *config_path = config["config_path"].string_value() != ""
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
int fd = open(config_path, O_RDONLY);
@@ -544,14 +553,14 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
if (errno != ENOENT)
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
return file_config;
return config;
}
struct stat st;
if (fstat(fd, &st) != 0)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return file_config;
return config;
}
std::string buf;
buf.resize(st.st_size);
@@ -563,125 +572,23 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return file_config;
return config;
}
done += r;
}
close(fd);
std::string json_err;
file_config = json11::Json::parse(buf, json_err).object_items();
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
if (json_err != "")
{
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
return config;
}
file_config.erase("config_path");
file_config.erase("osd_num");
for (auto kv: config.object_items())
{
file_config[kv.first] = kv.second;
}
return file_config;
}
static const char* cli_only_params[] = {
// The list has to be sorted
"bitmap_granularity",
"block_size",
"data_device",
"data_offset",
"data_size",
"disable_data_fsync",
"disable_device_lock",
"disable_journal_fsync",
"disable_meta_fsync",
"disk_alignment",
"flush_journal",
"immediate_commit",
"inmemory_journal",
"inmemory_metadata",
"journal_block_size",
"journal_device",
"journal_no_same_sector_overwrites",
"journal_offset",
"journal_sector_buffer_count",
"journal_size",
"meta_block_size",
"meta_buf_size",
"meta_device",
"meta_offset",
"osd_num",
"readonly",
};
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
static const char* local_only_params[] = {
// The list has to be sorted
"config_path",
"rdma_device",
"rdma_gid_index",
"rdma_max_msg",
"rdma_max_recv",
"rdma_max_send",
"rdma_max_sge",
"rdma_mtu",
"rdma_port_num",
"tcp_header_buffer_size",
"use_rdma",
"use_sync_send_recv",
};
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
// Basically could be replaced by std::lower_bound()...
static int find_str_array(const char **start, const char **end, const std::string & s)
{
int min = 0, max = end-start;
while (max-min >= 2)
{
int mid = (min+max)/2;
int r = strcmp(s.c_str(), start[mid]);
if (r < 0)
max = mid;
else if (r > 0)
min = mid;
else
return mid;
}
if (min < end-start && !strcmp(s.c_str(), start[min]))
return min;
return -1;
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
// Priority: most important -> less important:
// etcd_osd_config -> cli_config -> etcd_global_config -> file_config
json11::Json::object res = file_config;
for (auto & kv: file_config)
{
int cli_only = find_str_array(cli_only_params, cli_only_end, kv.first);
if (cli_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: etcd_global_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: cli_config)
{
res[kv.first] = kv.second;
}
for (auto & kv: etcd_osd_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
return res;
}

View File

@@ -37,6 +37,7 @@
#define MSGR_SENDP_HDR 1
#define MSGR_SENDP_FREE 2
#define MSGR_SENDP_LAST 4
struct msgr_sendp_t
{
@@ -131,9 +132,10 @@ protected:
bool use_rdma = true;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
uint64_t rdma_op_slots = 0, rdma_op_memory = 0;
msgr_rdma_context_t *rdma_context = NULL;
std::map<uint32_t, int> clients_by_qp;
#endif
std::vector<int> read_ready_clients;
@@ -166,15 +168,12 @@ public:
void accept_connections(int listen_fd);
~osd_messenger_t();
static json11::Json::object read_config(const json11::Json & config);
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config);
static json11::Json read_config(const json11::Json & config);
#ifdef WITH_RDMA
bool is_rdma_enabled();
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
bool connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
#endif
protected:
@@ -195,12 +194,13 @@ protected:
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
bool handle_reply_hdr(void *reply_hdr, osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
bool try_send_rdma(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
bool rdma_handle_op(osd_client_t *cl, uint32_t op_slot);
#endif
};

View File

@@ -43,15 +43,7 @@ void osd_messenger_t::send_replies()
{
}
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
return json11::Json::object();
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
return cli_config;
return config;
}

View File

@@ -46,9 +46,20 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
ctx->used_max_cqe -= max_send+max_recv;
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
for (auto b: recv_buffers)
free(b);
if (in_data_mr)
ibv_dereg_mr(in_data_mr);
if (in_op_mr)
ibv_dereg_mr(in_op_mr);
if (in_data_buf)
free(in_data_buf);
if (in_ops)
free(in_ops);
if (out_op_alloc)
delete out_op_alloc;
if (out_slot_data)
free(out_slot_data);
if (out_slot_ops)
free(out_slot_ops);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
@@ -149,7 +160,7 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
fprintf(stderr, "Couldn't register global RDMA memory region: %s\n", strerror(errno));
goto cleanup;
}
@@ -180,7 +191,7 @@ cleanup:
}
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory)
{
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
@@ -190,7 +201,6 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
conn->max_send = max_send;
conn->max_recv = max_recv;
conn->max_sge = max_sge;
conn->max_msg = max_msg;
ctx->used_max_cqe += max_send+max_recv;
if (ctx->used_max_cqe > ctx->max_cqe)
@@ -211,6 +221,30 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ctx->max_cqe = new_max_cqe;
}
conn->op_memory = op_memory;
conn->in_data_buf = memalign_or_die(MEM_ALIGNMENT, op_memory);
conn->in_data_mr = ibv_reg_mr(ctx->pd, conn->in_data_buf, op_memory,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_data_mr)
{
fprintf(stderr, "Couldn't register %lu MB RDMA memory region for incoming data: %s\n",
(op_memory+1024*1024-1)/1024/1024, strerror(errno));
delete conn;
return NULL;
}
conn->op_slots = op_slots;
conn->in_op_cap = op_slots;
conn->in_ops = (msgr_rdma_cmd_t *)malloc_or_die(sizeof(msgr_rdma_cmd_t) * op_slots);
conn->in_op_mr = ibv_reg_mr(ctx->pd, conn->in_ops, sizeof(msgr_rdma_cmd_t) * op_slots,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_op_mr)
{
fprintf(stderr, "Couldn't register %lu KB RDMA memory region for incoming operation headers: %s\n",
(sizeof(msgr_rdma_cmd_t) * op_slots + 1023)/1024, strerror(errno));
delete conn;
return NULL;
}
ibv_qp_init_attr init_attr = {
.send_cq = ctx->cq,
.recv_cq = ctx->cq,
@@ -237,7 +271,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.qp_access_flags = IBV_ACCESS_REMOTE_WRITE,
.pkey_index = 0,
.port_num = ctx->ib_port,
};
@@ -265,6 +299,19 @@ static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
return IBV_MTU_4096;
}
void msgr_rdma_connection_t::set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
assert(!out_op_alloc);
this->out_data_rkey = out_data_rkey;
this->out_op_rkey = out_op_rkey;
this->out_op_slots = out_op_slots;
this->out_op_memory = out_op_memory;
out_op_alloc = new allocator(out_op_slots);
out_data_alloc.free(0, out_op_memory);
out_slot_data = (msgr_rdma_out_pos_t *)malloc_or_die(sizeof(msgr_rdma_out_pos_t) * out_op_slots);
out_slot_ops = (osd_op_t **)malloc_or_die(sizeof(osd_op_t *) * out_op_slots);
}
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
auto conn = this;
@@ -311,17 +358,14 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
return 0;
}
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg)
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
// Try to connect to the peer using RDMA
msgr_rdma_address_t addr;
if (msgr_rdma_address_t::from_string(rdma_address.c_str(), &addr))
{
if (client_max_msg > rdma_max_msg)
{
client_max_msg = rdma_max_msg;
}
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (rdma_conn)
{
int r = rdma_conn->connect(&addr);
@@ -336,6 +380,8 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
else
{
// Remember connection, but switch to RDMA only after sending the configuration response
clients_by_qp[rdma_conn->qp->qp_num] = peer_fd;
rdma_conn->set_out_capacity(out_data_rkey, out_op_rkey, out_op_slots, out_op_memory);
auto cl = clients.at(peer_fd);
cl->rdma_conn = rdma_conn;
cl->peer_state = PEER_RDMA_CONNECTING;
@@ -346,83 +392,172 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
return false;
}
static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2+1),
.sg_list = sge,
.num_sge = op_sge,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
cl->rdma_conn->cur_send++;
}
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() && !rc->in_slots_freed.size() || rc->cur_send >= rc->max_send)
{
return true;
}
int i = 0;
while (i < rc->in_slots_freed.size())
{
auto op_slot = rc->in_slots_freed[i++];
assert(op_slot < 0x80000000);
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = 0,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.imm_data = 0x80000000 | op_slot,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
if (rc->cur_send >= rc->max_send)
{
break;
}
}
rc->in_slots_freed.erase(rc->in_slots_freed.begin(), rc->in_slots_freed.begin()+i);
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
while (rc->send_pos < cl->send_list.size())
int op_start = 0;
while (op_start < cl->send_list.size())
{
iovec & iov = cl->send_list[rc->send_pos];
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
uint64_t op_data_size = 0;
int op_end = op_start;
while (!(cl->outbox[op_end].flags & MSGR_SENDP_LAST))
{
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_send >= rc->max_send)
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
}
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
op_data_size -= cl->send_list[op_start].iov_len;
// Operation boundaries in send_list: op_start..op_end, first iovec is the header
uint64_t op_slot = rc->out_op_alloc->find_free();
if (op_slot == UINT64_MAX)
{
// op queue is full
return true;
}
uint64_t data_pos = UINT64_MAX;
if (op_data_size > 0)
{
if (rc->cur_send > rc->max_send-1-(op_end-op_start-1+rc->max_sge)/rc->max_sge)
{
break;
// RDMA queue is full
return true;
}
// FIXME: Oops, and what if op data is larger than the whole buffer... :)
data_pos = rc->out_data_alloc.alloc(op_data_size);
if (data_pos == UINT64_MAX)
{
// data buffers are full
return true;
}
int cur_sge = 0;
uint64_t data_posted = 0, wr_size = 0;
for (int data_sent = op_start+1; data_sent < op_end; data_sent++)
{
sge[cur_sge++] = {
.addr = (uintptr_t)cl->send_list[data_sent].iov_base,
.length = (uint32_t)cl->send_list[data_sent].iov_len,
.lkey = rc->ctx->mr->lkey,
};
wr_size += cl->send_list[data_sent].iov_len;
if (data_sent == op_end-1 || cur_sge >= rc->max_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = cur_sge,
.opcode = IBV_WR_RDMA_WRITE,
.send_flags = 0,
.wr = {
.rdma = {
// continue right after the bytes already posted for this operation
.remote_addr = data_pos + data_posted,
.rkey = rc->out_data_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
data_posted += wr_size;
wr_size = 0;
cur_sge = 0;
}
}
}
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos),
.length = len,
if (rc->cur_send > rc->max_send-1)
{
// RDMA queue is full
return true;
}
rc->out_op_alloc->set(op_slot, true);
assert(cl->send_list[op_start].iov_len == OSD_PACKET_SIZE);
sge[0] = {
.addr = (uintptr_t)cl->send_list[op_start].iov_base,
.length = (uint32_t)cl->send_list[op_start].iov_len,
.lkey = rc->ctx->mr->lkey,
};
op_size += len;
rc->send_buf_pos += len;
if (rc->send_buf_pos >= iov.iov_len)
rc->out_slot_data[op_slot] = { .data_pos = data_pos, .data_size = op_data_size };
rc->out_slot_ops[op_slot] = (cl->outbox[op_end-1].flags & MSGR_SENDP_FREE)
? cl->outbox[op_end-1].op : NULL;
sge[1] = {
.addr = (uintptr_t)(rc->out_slot_data+op_slot),
.length = sizeof(rc->out_slot_data[op_slot]),
.lkey = rc->ctx->mr->lkey,
};
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = 2,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.send_flags = IBV_SEND_SIGNALED,
.imm_data = (uint32_t)op_slot,
.wr = {
.rdma = {
.remote_addr = op_slot*sizeof(msgr_rdma_cmd_t),
.rkey = rc->out_op_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
rc->send_pos++;
rc->send_buf_pos = 0;
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
op_start = op_end;
}
if (op_sge > 0)
if (op_start > 0)
{
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_start);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_start);
}
return true;
}
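
try_send_rdma above relies on the new MSGR_SENDP_LAST flag (set in outbox_push further below) to find where each queued operation ends in send_list: the first iovec of an operation is its fixed-size header, everything up to and including the flagged entry belongs to the same operation, and only the non-header iovecs count as payload. A standalone sketch of just that boundary scan, with simplified stand-in types instead of the real messenger structures:

// Illustrative only: simplified stand-ins for the messenger's outbox entries; the 128-byte
// header size below is just an example value.
#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>
#include <utility>
#include <vector>

#define MSGR_SENDP_LAST 4

struct sendp_stub_t { int flags; };

// Returns (one-past-last iovec index, payload bytes) for every queued operation
static std::vector<std::pair<int, uint64_t>> scan_ops(const std::vector<iovec> & send_list,
    const std::vector<sendp_stub_t> & outbox)
{
    std::vector<std::pair<int, uint64_t>> ops;
    int op_start = 0;
    while (op_start < (int)send_list.size())
    {
        uint64_t op_data_size = 0;
        int op_end = op_start;
        while (!(outbox[op_end].flags & MSGR_SENDP_LAST))
            op_data_size += send_list[op_end++].iov_len;
        op_data_size += send_list[op_end++].iov_len;
        // the first iovec of every operation is its header, not payload
        op_data_size -= send_list[op_start].iov_len;
        ops.push_back({ op_end, op_data_size });
        op_start = op_end;
    }
    return ops;
}

int main()
{
    char hdr[128], data[4096];
    // one header-only operation followed by one operation with a 4 KiB payload
    std::vector<iovec> send_list = {
        { .iov_base = hdr, .iov_len = sizeof(hdr) },
        { .iov_base = hdr, .iov_len = sizeof(hdr) },
        { .iov_base = data, .iov_len = sizeof(data) },
    };
    std::vector<sendp_stub_t> outbox = { { MSGR_SENDP_LAST }, { 0 }, { MSGR_SENDP_LAST } };
    for (auto & op: scan_ops(send_list, outbox))
        printf("operation ends at iovec %d with %lu payload bytes\n", op.first, op.second);
    return 0;
}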
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
{
ibv_sge sge = {
.addr = (uintptr_t)buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2),
.sg_list = &sge,
.num_sge = 1,
.sg_list = sge,
.num_sge = op_sge,
};
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
@@ -433,18 +568,87 @@ static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
cl->rdma_conn->cur_recv++;
}
static void copy_data_to_recv_list(uint8_t *data_buf, uint64_t data_size, osd_client_t *cl)
{
uint64_t pos = 0;
while (cl->recv_list.done < cl->recv_list.count)
{
uint64_t cur = cl->recv_list.buf[cl->recv_list.done].iov_len;
assert(cur <= data_size-pos);
memcpy(cl->recv_list.buf[cl->recv_list.done].iov_base, data_buf+pos, cur);
pos += cur;
cl->recv_list.done++;
}
cl->recv_list.reset();
}
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
try_recv_rdma_wr(cl, NULL, 0);
}
return true;
}
bool osd_messenger_t::rdma_handle_op(osd_client_t *cl, uint32_t op_slot)
{
auto rc = cl->rdma_conn;
if (op_slot >= rc->in_op_cap)
{
// Invalid incoming index
fprintf(stderr, "Client %d invalid incoming RDMA op slot: %u, dropping connection\n", cl->peer_fd, op_slot);
stop_client(cl->peer_fd);
return false;
}
osd_op_header_t *hdr = (osd_op_header_t *)rc->in_ops[op_slot].header;
uint8_t *data_buf = (uint8_t*)rc->in_data_buf + rc->in_ops[op_slot].pos.data_pos;
uint64_t data_size = rc->in_ops[op_slot].pos.data_size;
if (hdr->magic == SECONDARY_OSD_REPLY_MAGIC)
{
// Reply
if (cl->read_op)
{
delete cl->read_op;
cl->read_op = NULL;
}
if (!handle_reply_hdr(rc->in_ops[op_slot].header, cl))
return false;
if (cl->read_state == CL_READ_REPLY_DATA)
{
// copy reply data to cl->recv_list
copy_data_to_recv_list(data_buf, data_size, cl);
// and handle reply with data
handle_reply_ready(cl->read_op);
cl->read_op = NULL;
cl->read_state = 0;
cl->read_remaining = 0;
}
}
else
{
// Operation
cl->read_op = new osd_op_t;
cl->read_op->peer_fd = cl->peer_fd;
cl->read_op->op_type = OSD_OP_IN;
memcpy(&cl->read_op->req, hdr, OSD_PACKET_SIZE);
handle_op_hdr(cl);
if (cl->read_state == CL_READ_DATA)
{
copy_data_to_recv_list(data_buf, data_size, cl);
// And handle the incoming op with data
cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
cl->read_op = NULL;
cl->read_state = 0;
}
}
// We don't need the incoming data buffer anymore, notify peer about it
// FIXME: Allow to pass memory to the internal layer without copying and notify after handling it
rc->in_slots_freed.push_back(op_slot);
return true;
}
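
The 32-bit immediate value is the only signalling channel in this scheme: handle_rdma_events below treats a value with the top bit clear as "a new command has been written into in_ops[slot]", and a value with the top bit set (as posted at the top of try_send_rdma) as "the peer has released outgoing slot number slot". A tiny sketch of that encoding, separate from the real code:

// Sketch of the 32-bit immediate-data convention used by this patch (top bit = "slot freed").
#include <assert.h>
#include <stdint.h>

static inline uint32_t imm_new_op(uint32_t op_slot)     { return op_slot; }
static inline uint32_t imm_slot_freed(uint32_t op_slot) { return 0x80000000u | op_slot; }
static inline bool     imm_is_free(uint32_t imm)        { return imm & 0x80000000u; }
static inline uint32_t imm_slot(uint32_t imm)           { return imm & 0x7FFFFFFFu; }

int main()
{
    uint32_t imm = imm_slot_freed(42);
    assert(imm_is_free(imm) && imm_slot(imm) == 42);
    assert(!imm_is_free(imm_new_op(42)) && imm_slot(imm_new_op(42)) == 42);
    return 0;
}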
#define RDMA_EVENTS_AT_ONCE 32
void osd_messenger_t::handle_rdma_events()
@@ -469,80 +673,60 @@ void osd_messenger_t::handle_rdma_events()
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
for (int i = 0; i < event_count; i++)
{
int client_id = wc[i].wr_id >> 1;
bool is_send = wc[i].wr_id & 1;
auto cl_it = clients.find(client_id);
auto cqp_it = clients_by_qp.find(wc[i].qp_num);
int peer_fd = cqp_it != clients_by_qp.end() ? cqp_it->second : -1;
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
}
osd_client_t *cl = cl_it->second;
auto rc = cl->rdma_conn;
if (wc[i].status != IBV_WC_SUCCESS)
{
fprintf(stderr, "RDMA work request failed for client %d", client_id);
fprintf(stderr, "RDMA work request failed for client %d", peer_fd);
if (cl->osd_num)
{
fprintf(stderr, " (OSD %lu)", cl->osd_num);
}
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
if (peer_fd >= 0)
stop_client(peer_fd);
continue;
}
if (!is_send)
auto rc = cl->rdma_conn;
if (wc[i].opcode == IBV_WC_RDMA_WRITE)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
// Operation or reply is sent, we can free it
auto & op = rc->out_slot_ops[wc[i].wr_id];
if (op)
{
// handle_read_buffer may stop the client
continue;
delete op;
op = NULL;
}
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
}
else
{
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
try_send_rdma(cl);
}
else if (wc[i].opcode == IBV_WC_RECV)
{
if (!(wc[i].imm_data & 0x80000000))
{
if (sent_size >= cl->send_list.at(send_pos).iov_len)
// Operation or reply received. Handle it
if (!rdma_handle_op(cl, wc[i].imm_data))
{
sent_size -= cl->send_list[send_pos].iov_len;
send_pos++;
}
else
{
send_buf_pos = sent_size;
sent_size = 0;
// false means that the client is stopped due to invalid operation
continue;
}
rc->cur_recv--;
try_recv_rdma(cl);
}
assert(rc->send_pos >= send_pos);
if (rc->send_pos == send_pos)
else
{
rc->send_buf_pos -= send_buf_pos;
}
rc->send_pos -= send_pos;
for (int i = 0; i < send_pos; i++)
{
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[i].op;
}
}
if (send_pos > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
}
if (send_buf_pos > 0)
{
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
cl->send_list[0].iov_len -= send_buf_pos;
// Outbox slot is marked as free (the remote side doesn't need it anymore)
uint32_t op_slot = wc[i].imm_data & 0x7FFFFFFF;
auto & pos = rc->out_slot_data[op_slot];
if (pos.data_size > 0)
rc->out_data_alloc.free(pos.data_pos, pos.data_size);
rc->out_op_alloc->set(op_slot, false);
}
// Try to continue sending
try_send_rdma(cl);
}
}

View File

@@ -5,6 +5,11 @@
#include <infiniband/verbs.h>
#include <string>
#include <vector>
#include "allocator.h"
#include "freelist.h"
#include "osd_ops.h"
struct osd_op_t;
struct msgr_rdma_address_t
{
@@ -39,6 +44,17 @@ struct msgr_rdma_context_t
~msgr_rdma_context_t();
};
struct msgr_rdma_out_pos_t
{
uint64_t data_pos, data_size;
};
struct msgr_rdma_cmd_t
{
uint8_t header[OSD_PACKET_SIZE];
msgr_rdma_out_pos_t pos;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -46,14 +62,24 @@ struct msgr_rdma_connection_t
msgr_rdma_address_t addr;
int max_send = 0, max_recv = 0, max_sge = 0;
int cur_send = 0, cur_recv = 0;
uint64_t max_msg = 0;
uint64_t op_slots = 0, op_memory = 0;
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<void*> recv_buffers;
std::vector<uint64_t> send_sizes;
ibv_mr *in_data_mr = NULL, *in_op_mr = NULL;
msgr_rdma_cmd_t *in_ops = NULL;
int in_op_cap = 0;
void *in_data_buf = NULL;
std::vector<uint32_t> in_slots_freed;
uint32_t out_data_rkey = 0, out_op_rkey = 0;
uint64_t out_op_slots = 0, out_op_memory = 0;
allocator *out_op_alloc = NULL;
freelist_allocator_t out_data_alloc;
msgr_rdma_out_pos_t *out_slot_data = NULL;
osd_op_t **out_slot_ops = NULL;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory);
int connect(msgr_rdma_address_t *dest);
void set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
};
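
For scale: with the defaults from osd_messenger_t::parse_config above (rdma_op_slots = 4096, rdma_op_memory = 16 MiB), every connection registers one op_memory-sized region for incoming data plus op_slots * sizeof(msgr_rdma_cmd_t) bytes for incoming headers. A back-of-the-envelope sketch, assuming OSD_PACKET_SIZE is 128 bytes (that constant is not part of this diff):

// Rough per-connection size of the two receive-side regions registered in
// msgr_rdma_connection_t::create(); OSD_PACKET_SIZE = 128 is an assumption here.
#include <stdint.h>
#include <stdio.h>

int main()
{
    const uint64_t OSD_PACKET_SIZE = 128;                     // assumed header size
    const uint64_t op_slots  = 4096;                          // rdma_op_slots default
    const uint64_t op_memory = 16*1024*1024;                  // rdma_op_memory default
    uint64_t cmd_size = OSD_PACKET_SIZE + 2*sizeof(uint64_t); // header + msgr_rdma_out_pos_t
    printf("in_data_buf: %lu MiB, in_ops: %lu KiB per connection\n",
        op_memory/1024/1024, op_slots*cmd_size/1024);
    return 0;
}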

View File

@@ -172,7 +172,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
if (cl->read_state == CL_READ_HDR)
{
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
return handle_reply_hdr(cl);
return handle_reply_hdr(cl->read_op->req.buf, cl);
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
handle_op_hdr(cl);
else
@@ -286,7 +286,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
}
}
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
bool osd_messenger_t::handle_reply_hdr(void *reply_hdr, osd_client_t *cl)
{
auto req_it = cl->sent_ops.find(((osd_any_reply_t*)reply_hdr)->hdr.id);
if (req_it == cl->sent_ops.end())
@@ -297,7 +297,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
return false;
}
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE);
memcpy(op->reply.buf, reply_hdr, OSD_PACKET_SIZE);
cl->sent_ops.erase(req_it);
if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
{
@@ -328,14 +328,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
goto reuse;
}
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
}
else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
@@ -345,7 +347,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
else if (op->reply.hdr.opcode == OSD_OP_SEC_READ_BMP && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -355,7 +358,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
}
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
{
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -368,7 +372,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
reuse:
// It's fine to reuse cl->read_op for the next reply
handle_reply_ready(op);
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
if (cl->read_op)
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
cl->read_remaining = OSD_PACKET_SIZE;
cl->read_state = CL_READ_HDR;
}

View File

@@ -96,6 +96,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
}
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_LAST;
if (cur_op->op_type == OSD_OP_IN)
{
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;

View File

@@ -129,6 +129,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
#ifdef WITH_RDMA
if (cl->rdma_conn)
{
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
}
#endif

View File

@@ -39,11 +39,6 @@ struct __attribute__((__packed__)) obj_ver_id
uint64_t version;
};
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid == b.oid && a.version == b.version;
}
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;

View File

@@ -35,18 +35,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
this->ringloop = ringloop;
this->cli_config = config.object_items();
this->file_config = msgr.read_config(this->cli_config);
parse_config(true);
this->config = msgr.read_config(config).object_items();
if (this->config.find("log_level") == this->config.end())
this->config["log_level"] = 1;
parse_config(this->config, true);
epmgr = new epoll_manager_t(ringloop);
// FIXME: Use timerfd_interval based directly on io_uring
this->tfd = epmgr->tfd;
if (!json_is_true(this->config["disable_blockstore"]))
auto bs_cfg = json_to_bs(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
{
auto bs_cfg = json_to_bs(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
if (autosync_writes > max_autosync)
@@ -67,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
}
}
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
@@ -91,42 +91,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
osd_t::~osd_t()
{
if (slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = -1;
}
if (print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = -1;
}
if (autosync_timer_id >= 0)
{
tfd->clear_timer(autosync_timer_id);
autosync_timer_id = -1;
}
ringloop->unregister_consumer(&consumer);
delete epmgr;
if (bs)
delete bs;
delete bs;
close(listen_fd);
free(zero_buffer);
}
void osd_t::parse_config(bool init)
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
{
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
if (config.find("log_level") == this->config.end())
config["log_level"] = 1;
if (bs)
{
auto bs_cfg = json_to_bs(config);
bs->parse_config(bs_cfg);
}
st_cli.parse_config(config);
msgr.parse_config(config);
if (init)
if (allow_disk_params)
{
// OSD number
osd_num = config["osd_num"].uint64_value();
@@ -148,27 +124,24 @@ void osd_t::parse_config(bool init)
immediate_commit = IMMEDIATE_SMALL;
else
immediate_commit = IMMEDIATE_NONE;
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);
}
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
log_level = config["log_level"].uint64_value();
auto old_no_rebalance = no_rebalance;
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
no_rebalance = json_is_true(config["no_rebalance"]);
auto old_no_recovery = no_recovery;
no_recovery = json_is_true(config["no_recovery"]);
auto old_autosync_interval = autosync_interval;
allow_test_ops = json_is_true(config["allow_test_ops"]);
if (!config["autosync_interval"].is_null())
{
// Allow to set it to 0
@@ -196,46 +169,15 @@ void osd_t::parse_config(bool init)
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
auto old_print_stats_interval = print_stats_interval;
print_stats_interval = config["print_stats_interval"].uint64_value();
if (!print_stats_interval)
print_stats_interval = 3;
auto old_slow_log_interval = slow_log_interval;
slow_log_interval = config["slow_log_interval"].uint64_value();
if (!slow_log_interval)
slow_log_interval = 10;
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
{
peering_state = peering_state | OSD_RECOVERING;
}
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
{
this->tfd->clear_timer(autosync_timer_id);
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
}
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
}
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
}
}
void osd_t::bind_socket()
@@ -533,7 +475,7 @@ void osd_t::print_slow()
}
}
}
if (has_slow && bs)
if (has_slow)
{
bs->dump_diagnostics();
}

View File

@@ -90,7 +90,7 @@ class osd_t
{
// config
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
json11::Json::object config;
int etcd_report_interval = 5;
bool readonly = false;
@@ -126,7 +126,6 @@ class osd_t
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
// peers and PGs
@@ -153,7 +152,7 @@ class osd_t
bool stopping = false;
int inflight_ops = 0;
blockstore_t *bs = NULL;
blockstore_t *bs;
void *zero_buffer = NULL;
uint64_t zero_buffer_size = 0;
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
@@ -174,7 +173,7 @@ class osd_t
uint64_t recovery_stat_bytes[2][2] = {};
// cluster connection
void parse_config(bool init);
void parse_config(const json11::Json & config, bool allow_disk_params);
void init_cluster();
void on_change_osd_state_hook(osd_num_t peer_osd);
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);

View File

@@ -75,7 +75,7 @@ void osd_t::init_cluster()
}
if (run_primary && autosync_interval > 0)
{
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
char time_str[50] = { 0 };
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
st["time"] = time_str;
st["blockstore_ready"] = bs->is_started();
st["data_block_size"] = (uint64_t)bs->get_block_size();
if (bs)
{
st["blockstore_ready"] = bs->is_started();
st["data_block_size"] = (uint64_t)bs->get_block_size();
st["size"] = bs->get_block_count() * bs->get_block_size();
st["free"] = bs->get_free_block_count() * bs->get_block_size();
}
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
json11::Json::object inode_space;
json11::Json::object last_stat;
pool_id_t last_pool = 0;
std::map<uint64_t, uint64_t> bs_empty_space;
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
auto & bs_inode_space = bs->get_inode_space_stats();
for (auto kv: bs_inode_space)
{
pool_id_t pool_id = INODE_POOL(kv.first);
@@ -375,11 +374,7 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
{
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
{
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
parse_config(false);
}
// FIXME apply config changes in runtime (maybe, some)
if (run_primary)
{
apply_pg_count();
@@ -389,8 +384,11 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
etcd_global_config = global_config;
parse_config(true);
json11::Json::object osd_config = this->config;
for (auto & kv: global_config)
if (osd_config.find(kv.first) == osd_config.end())
osd_config[kv.first] = kv.second;
parse_config(osd_config, false);
bind_socket();
acquire_lease();
}

View File

@@ -64,11 +64,6 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
{
if (log_level > 2)
{
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
}
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
{
@@ -104,9 +99,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
std::vector<osd_op_t*> continue_ops;
auto & pg = pgs.at(pg_id);
auto it = pg.flush_actions.begin(), prev_it = it;
auto erase_start = it;
while (1)
{
if (it == pg.flush_actions.end() || !it->second.submitted ||
if (it == pg.flush_actions.end() ||
it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
{
@@ -120,23 +116,29 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
});
if (wr_it != pg.write_queue.end())
{
if (log_level > 2)
{
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
}
continue_ops.push_back(wr_it->second);
pg.write_queue.erase(wr_it);
}
}
if (it == pg.flush_actions.end() || !it->second.submitted)
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
erase_start != it)
{
pg.flush_actions.erase(erase_start, it);
}
if (it == pg.flush_actions.end())
{
if (it != pg.flush_actions.begin())
{
pg.flush_actions.erase(pg.flush_actions.begin(), it);
}
break;
}
prev_it = it++;
prev_it = it;
if (!it->second.submitted)
{
it++;
erase_start = it;
}
else
{
it++;
}
}
delete fb;
pg.flush_batch = NULL;
@@ -166,18 +168,6 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
// Copy buffer so it gets freed along with the operation
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
if (log_level > 2)
{
printf(
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
);
for (int i = 0; i < count; i++)
{
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
}
printf("\n");
}
if (peer_osd == this->osd_num)
{
// local
@@ -314,10 +304,9 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
);
}
else

View File

@@ -76,7 +76,7 @@ void osd_t::handle_peers()
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
}
}
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
if ((peering_state & OSD_RECOVERING) && !readonly)
{
if (!continue_recovery())
{

View File

@@ -91,7 +91,7 @@ void pg_obj_state_check_t::walk()
pg->state |= PG_DEGRADED;
}
pg->state |= PG_ACTIVE;
if (pg->cur_peers.size() < pg->all_peers.size())
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
}

View File

@@ -53,10 +53,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
{
if (cur_op->op_data)
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
}
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
else
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
}

View File

@@ -166,7 +166,7 @@ resume_6:
for (int i = 0; i < unstable_osd.len; i++)
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[unstable_osd.start + i];
auto & w = op_data->unstable_writes[i];
pool_pg_num_t wpg = {
.pool_id = INODE_POOL(w.oid.inode),
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),

View File

@@ -12,7 +12,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
.oid = op_data->oid,
.osd_num = 0,
});
op_data->st = 1;
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
@@ -24,6 +23,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
}

View File

@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
for (int i = 0; i < a.size && i < b.size; i++)
{
if (a.data[i] < b.data[i])
return true;
return -1;
else if (a.data[i] > b.data[i])
return false;
return 1;
}
return false;
return 0;
}
struct reed_sol_matrix_t
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
{
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
if (stripe.write_end > wr_start &&
stripe.write_start < wr_end)
if (stripe.req_end > wr_start &&
stripe.req_start < wr_end)
{
ns = std::max(stripe.write_start, wr_start);
ne = std::min(stripe.write_end, wr_end);
ns = std::max(stripe.req_start, wr_start);
ne = std::min(stripe.req_end, wr_end);
}
if (stripe.read_end > wr_start &&
stripe.read_start < wr_end)
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
if (ne && (!oe || ns <= os))
{
// NEW or NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
if (os < ne)
os = ne;
if (oe > os)
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
{
// OLD->NEW or OLD->NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
if (oe > ne)
{
// OLD->NEW->OLD
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
uint32_t &start, uint32_t &end)
{
bool required = false;
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0)
{
// Whole parity chunk is needed when we move the object
if (write_osd_set[role] != read_osd_set[role])
end = chunk_size;
required = true;
}
}
if (required && end != chunk_size)
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
{
// start & end are required for calc_rmw_parity
for (int role = 0; role < pg_minsize; role++)
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
end = std::max(stripes[role].req_end, end);
}
}
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
{
start = 0;
end = chunk_size;
}
}
}
// Set bitmap bits accordingly
if (bitmap_granularity > 0)

View File

@@ -17,7 +17,6 @@ void test4();
void test5();
void test6();
void test7();
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
void test8();
void test9();
void test10();
@@ -25,7 +24,7 @@ void test11();
void test12();
void test13();
void test14();
void test15(bool second);
void test15();
void test16();
int main(int narg, char *args[])
@@ -40,8 +39,6 @@ int main(int narg, char *args[])
test6();
// Test 7
test7();
test_rmw_4k_degraded_into_lost_to_normal(false);
test_rmw_4k_degraded_into_lost_to_normal(true);
// Test 8
test8();
// Test 9
@@ -57,8 +54,7 @@ int main(int narg, char *args[])
// Test 14
test14();
// Test 15
test15(false);
test15(true);
test15();
// Test 16
test16();
// End
@@ -319,69 +315,6 @@ void test7()
/***
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
input buffer: [ write0 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity_xor/ec(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
write0==read0,
}
+ check write0, write2 buffers
***/
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
{
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {};
// Subtest 1
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
void *write_buf = malloc(4096);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
// Subtest 2
set_pattern(write_buf, 4096, PATTERN2);
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
if (!ec)
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
else
{
use_ec(3, 2, true);
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
use_ec(3, 2, false);
}
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].write_buf == stripes[0].read_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
/***
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
@@ -893,11 +826,12 @@ void test14()
***/
void test15(bool second)
void test15()
{
const int bmp = 64*1024 / 4096 / 8;
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 };
// Test 15.0
@@ -908,7 +842,7 @@ void test15(bool second)
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
// Test 15.1
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i;
assert(rmw_buf);
@@ -918,34 +852,32 @@ void test15(bool second)
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == NULL);
assert(stripes[2].read_buf == NULL);
assert(stripes[3].read_buf == NULL);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3-second].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
// Test 15.2 - encode
set_pattern(write_buf, 4*1024, PATTERN1);
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp);
memset(stripes[2+second].write_buf, 0, 4096);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3-second].write_buf == NULL);
// first parity is always xor :), second isn't...
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
// Done
free(rmw_buf);
free(write_buf);

View File

@@ -166,15 +166,20 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
{
// Indicate that RDMA is enabled
wire_config["rdma_enabled"] = true;
if (req_json["connect_rdma"].is_string())
if (req_json["connect_rdma"].is_string() && req_json["rdma_op_memory"].uint64_value() != 0)
{
// Peer is trying to connect using RDMA, try to satisfy him
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(),
req_json["rdma_data_rkey"].uint64_value(), req_json["rdma_op_rkey"].uint64_value(),
req_json["rdma_op_slots"].uint64_value(), req_json["rdma_op_memory"].uint64_value());
if (ok)
{
auto rc = msgr.clients.at(cur_op->peer_fd)->rdma_conn;
wire_config["rdma_address"] = rc->addr.to_string();
wire_config["rdma_max_msg"] = rc->max_msg;
wire_config["rdma_data_rkey"] = (uint64_t)rc->in_data_mr->rkey;
wire_config["rdma_op_rkey"] = (uint64_t)rc->in_op_mr->rkey;
wire_config["rdma_op_slots"] = rc->op_slots;
wire_config["rdma_op_memory"] = rc->op_memory;
}
}
}
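For reference, the handshake fields exchanged above are everything a peer needs to remember in order to push data into the OSD's registered buffers with IBV_WR_RDMA_WRITE. A minimal sketch of the client-side bookkeeping, assuming the json11 wrapper used elsewhere in the code base; the struct, helper name and include path are illustrative, not the actual messenger code:

#include <stdint.h>
#include <string>
#include "json11/json11.hpp"

struct rdma_peer_params_t
{
    std::string rdma_address; // address of the OSD's RDMA endpoint
    uint64_t data_rkey = 0;   // rkey of the remote data buffer (in_data_mr)
    uint64_t op_rkey = 0;     // rkey of the remote operation-slot buffer (in_op_mr)
    uint64_t op_slots = 0;    // number of operation slots the OSD reserved
    uint64_t op_memory = 0;   // size of the remote operation memory region
};

// wire_config is the parsed JSON reply to the show_config request
static rdma_peer_params_t parse_rdma_reply(const json11::Json & wire_config)
{
    rdma_peer_params_t p;
    p.rdma_address = wire_config["rdma_address"].string_value();
    p.data_rkey = wire_config["rdma_data_rkey"].uint64_value();
    p.op_rkey = wire_config["rdma_op_rkey"].uint64_value();
    p.op_slots = wire_config["rdma_op_slots"].uint64_value();
    p.op_memory = wire_config["rdma_op_memory"].uint64_value();
    return p;
}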

View File

@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
perror("connect");
close(connect_fd);
return -1;
}
int one = 1;
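One detail worth noting in this hunk: whichever side of the comparison ends up in the tree, the error path of connect() should close the socket before returning, otherwise every failed connection attempt leaks a file descriptor. A standalone sketch of the pattern (not the project's connect_osd itself):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>
#include <stdio.h>

static int connect_tcp(const sockaddr_in & addr)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
    {
        perror("socket");
        return -1;
    }
    if (connect(fd, (const sockaddr*)&addr, sizeof(addr)) < 0)
    {
        perror("connect");
        close(fd); // without this, the descriptor is leaked on every failed attempt
        return -1;
    }
    int one = 1;
    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
    return fd;
}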

View File

@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
size_t done = 0;
while (done < remaining)
{
ssize_t r = read(fd, read_buf, remaining-done);
size_t r = read(fd, read_buf, remaining-done);
if (r <= 0)
{
if (!errno)
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
size_t done = 0;
while (done < remaining)
{
ssize_t r = write(fd, write_buf, remaining-done);
size_t r = write(fd, write_buf, remaining-done);
if (r < 0)
{
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
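The type change in these two helpers is more than cosmetic: read() and write() return ssize_t, and storing the result in an unsigned size_t means the `r < 0` / `r <= 0` error checks can never see a negative value. A minimal standalone sketch of the signed-result pattern (illustrative only; the project's read_blocking/write_blocking differ in details such as error reporting):

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

// returns 1 on success, 0 on premature EOF, -1 on error
static int read_exact(int fd, void *buf, size_t remaining)
{
    size_t done = 0;
    while (done < remaining)
    {
        ssize_t r = read(fd, (char*)buf + done, remaining - done);
        if (r < 0)
        {
            if (errno == EINTR || errno == EAGAIN)
                continue; // retry after interruption
            return -1;    // real error
        }
        if (r == 0)
            return 0;     // EOF before all bytes arrived
        done += r;
    }
    return 1;
}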

View File

@@ -83,7 +83,6 @@ int connect_stub(const char *server_address, int server_port)
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
perror("connect");
close(connect_fd);
return -1;
}
int one = 1;

src/test_freelist.cpp (new file, 64 lines)
View File

@@ -0,0 +1,64 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#include "freelist.cpp"
inline bool operator == (const freelist_item_t & a, const freelist_item_t & b)
{
return a.start == b.start && a.size == b.size;
}
void dump(std::vector<freelist_item_t> & freelist)
{
printf("free: ");
for (auto & item: freelist)
{
printf("%lx+%lx ", item.start, item.size);
}
printf("\n");
}
void dump(freelist_allocator_t &alloc)
{
dump(alloc.freelist);
}
uint64_t test_alloc(freelist_allocator_t &alloc, uint64_t size)
{
uint64_t r = alloc.alloc(size);
printf("alloc %lx: %lx\n", size, r);
return r;
}
void assert_eq(freelist_allocator_t &alloc, std::vector<freelist_item_t> v)
{
if (alloc.freelist != v)
{
printf("expected ");
dump(v);
printf("got ");
dump(alloc);
throw std::runtime_error("test failed");
}
dump(alloc);
}
int main(int narg, char *args[])
{
freelist_allocator_t alloc;
alloc.free(0, 0x1000000);
assert_eq(alloc, { { 0, 0x1000000 } });
assert(test_alloc(alloc, 0x1000) == 0);
assert_eq(alloc, { { 0x1000, 0xfff000 } });
assert(test_alloc(alloc, 0x4000) == 0x1000);
alloc.free(0x1000000, 0x4000);
assert_eq(alloc, { { 0x5000, 0xfff000 } });
alloc.free(0, 0x1000);
assert_eq(alloc, { { 0, 0x1000 }, { 0x5000, 0xfff000 } });
alloc.free(0x1000, 0x4000);
assert_eq(alloc, { { 0, 0x1004000 } });
return 0;
}
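The test above pins down the behaviour it expects from freelist.cpp: extents kept sorted by start, first-fit allocation that cuts from the beginning of an extent, and free() that coalesces with both neighbours. A minimal sketch that satisfies exactly these assertions (standalone illustration, not the real src/freelist.cpp; the UINT64_MAX "no space" value is an assumption, since the test never exercises allocation failure):

#include <stdint.h>
#include <vector>

struct freelist_item_t
{
    uint64_t start, size;
};

struct freelist_allocator_t
{
    std::vector<freelist_item_t> freelist; // sorted by start, non-overlapping

    uint64_t alloc(uint64_t size)
    {
        for (size_t i = 0; i < freelist.size(); i++)
        {
            if (freelist[i].size >= size)
            {
                uint64_t r = freelist[i].start; // first fit, cut from the beginning of the extent
                freelist[i].start += size;
                freelist[i].size -= size;
                if (!freelist[i].size)
                    freelist.erase(freelist.begin()+i);
                return r;
            }
        }
        return UINT64_MAX; // assumed failure marker
    }

    void free(uint64_t start, uint64_t size)
    {
        // insert in sorted position, then merge with the right and left neighbours
        size_t i = 0;
        while (i < freelist.size() && freelist[i].start < start)
            i++;
        freelist.insert(freelist.begin()+i, { start, size });
        if (i+1 < freelist.size() && freelist[i].start+freelist[i].size == freelist[i+1].start)
        {
            freelist[i].size += freelist[i+1].size;
            freelist.erase(freelist.begin()+i+1);
        }
        if (i > 0 && freelist[i-1].start+freelist[i-1].size == freelist[i].start)
        {
            freelist[i-1].size += freelist[i].size;
            freelist.erase(freelist.begin()+i);
        }
    }
};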

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.7
Version: 0.8.5
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -64,4 +64,4 @@ echo leak:librbd >> testdata/lsan-suppress.txt
echo leak:_M_mutate >> testdata/lsan-suppress.txt
echo leak:_M_assign >> testdata/lsan-suppress.txt
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
export ASAN_OPTIONS=verify_asan_link_order=false:abort_on_error=1
export ASAN_OPTIONS=verify_asan_link_order=false

View File

@@ -17,17 +17,17 @@ else
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}

View File

@@ -43,6 +43,3 @@ SCHEME=ec ./test_snapshot.sh
SCHEME=xor ./test_write.sh
./test_write_no_same.sh
./test_heal.sh
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh

View File

@@ -43,7 +43,7 @@ kill_osds &
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \

View File

@@ -7,7 +7,7 @@ OSD_COUNT=5
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done

View File

@@ -53,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/test_osd$i.bin \
--meta_offset 0 \
--journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
--data_offset $((128*1024*1024)) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done

View File

@@ -21,8 +21,7 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
# Kill OSD 2, start OSD 1
kill $OSD2_PID
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
sleep 2
# Check PG state - it should NOT become active

View File

@@ -10,7 +10,7 @@ etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
OSD_COUNT=3
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done