Compare commits
44 Commits
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 23a9aa93b5 | |
Vitaliy Filippov | 2412d9e239 | |
Vitaliy Filippov | 9301c857b1 | |
Vitaliy Filippov | 3094358ec2 | |
Vitaliy Filippov | 87f666d2a2 | |
Vitaliy Filippov | bd7fe4ef8f | |
Vitaliy Filippov | 1b3f9a1416 | |
Vitaliy Filippov | a7b7354f38 | |
Vitaliy Filippov | 765befa22f | |
Vitaliy Filippov | 87b3ab94fe | |
Vitaliy Filippov | 2c0801f6e4 | |
Vitaliy Filippov | fd83fef1d9 | |
Vitaliy Filippov | 8d1067971b | |
Vitaliy Filippov | ae5af04fde | |
Vitaliy Filippov | 266d038b11 | |
Vitaliy Filippov | ff4414d37e | |
Vitaliy Filippov | 0fa7ecc03f | |
Vitaliy Filippov | c29bfe12eb | |
Vitaliy Filippov | 57bf84ddb2 | |
Vitaliy Filippov | dff4879c8c | |
Vitaliy Filippov | af9a853db6 | |
Vitaliy Filippov | b7a3275af3 | |
Vitaliy Filippov | 64c5c4ca26 | |
idelson | 442a9d838d | |
Vitaliy Filippov | 6366972fe8 | |
Vitaliy Filippov | 2b863fb715 | |
Vitaliy Filippov | 3bf4dd5abd | |
Vitaliy Filippov | 3b84dcaedd | |
Vitaliy Filippov | 20fbc4a745 | |
Vitaliy Filippov | 02993ee1dd | |
Vitaliy Filippov | 3629dbc54d | |
Vitaliy Filippov | 29284bef40 | |
Vitaliy Filippov | 6a924d6066 | |
Vitaliy Filippov | 9fe779a691 | |
Vitaliy Filippov | 31c2751b9b | |
Vitaliy Filippov | c5195666cd | |
Vitaliy Filippov | f36d7eb76c | |
Vitaliy Filippov | dd7f651de1 | |
Vitaliy Filippov | a2994ecd0d | |
Vitaliy Filippov | 5d3aaf016b | |
Vitaliy Filippov | 0b097ca3f2 | |
Vitaliy Filippov | 989675a780 | |
Vitaliy Filippov | f8c403ec9e | |
Vitaliy Filippov | bfbb85e653 |
|
@ -64,6 +64,13 @@ jobs:
|
|||
# leak sanitizer sometimes crashes
|
||||
- run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test
|
||||
|
||||
npm_lint:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- run: cd /root/vitastor/mon && npm run lint
|
||||
|
||||
test_add_osd:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
@ -532,6 +539,24 @@ jobs:
|
|||
echo ""
|
||||
done
|
||||
|
||||
test_root_node:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_root_node.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_switch_primary:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
@ -748,6 +773,96 @@ jobs:
|
|||
echo ""
|
||||
done
|
||||
|
||||
test_osd_tags:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_osd_tags.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_enospc:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_enospc.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_enospc_xor:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: SCHEME=xor /root/vitastor/tests/test_enospc.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_enospc_imm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_enospc_imm_xor:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_scrub:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
|
|
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
|||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "1.5.0")
|
||||
set(VERSION "1.6.1")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
|
|
@ -64,10 +64,12 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
|
|||
- [NBD](docs/usage/nbd.ru.md) для монтирования ядром
|
||||
- [QEMU и qemu-img](docs/usage/qemu.ru.md)
|
||||
- [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
|
||||
- [Администрирование](docs/usage/admin.ru.md)
|
||||
- Производительность
|
||||
- [Понимание сути производительности](docs/performance/understanding.ru.md)
|
||||
- [Теоретический максимум](docs/performance/theoretical.ru.md)
|
||||
- [Пример сравнения с Ceph](docs/performance/comparison1.ru.md)
|
||||
- [Более новый тест Vitastor 1.3.1](docs/performance/bench2.ru.md)
|
||||
|
||||
## Автор и лицензия
|
||||
|
||||
|
|
|
@ -64,10 +64,12 @@ Read more details below in the documentation.
|
|||
- [NBD](docs/usage/nbd.en.md) for kernel mounts
|
||||
- [QEMU and qemu-img](docs/usage/qemu.en.md)
|
||||
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
|
||||
- [Administration](docs/usage/admin.en.md)
|
||||
- Performance
|
||||
- [Understanding storage performance](docs/performance/understanding.en.md)
|
||||
- [Theoretical performance](docs/performance/theoretical.en.md)
|
||||
- [Example comparison with Ceph](docs/performance/comparison1.en.md)
|
||||
- [Newer benchmark of Vitastor 1.3.1](docs/performance/bench2.en.md)
|
||||
|
||||
## Author and License
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
VERSION ?= v1.5.0
|
||||
VERSION ?= v1.6.1
|
||||
|
||||
all: build push
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ spec:
|
|||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v1.5.0
|
||||
image: vitalif/vitastor-csi:v1.6.1
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
|
|
@ -121,7 +121,7 @@ spec:
|
|||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v1.5.0
|
||||
image: vitalif/vitastor-csi:v1.6.1
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
|
|
@ -5,7 +5,7 @@ package vitastor
|
|||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "1.5.0"
|
||||
vitastorCSIDriverVersion = "1.6.1"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
|
|
@ -5,14 +5,13 @@ package vitastor
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
|
@ -34,6 +33,9 @@ type NodeServer struct
|
|||
stateDir string
|
||||
mounter mount.Interface
|
||||
restartInterval time.Duration
|
||||
mu sync.Mutex
|
||||
cond *sync.Cond
|
||||
volumeLocks map[string]bool
|
||||
}
|
||||
|
||||
type DeviceState struct
|
||||
|
@ -63,7 +65,9 @@ func NewNodeServer(driver *Driver) *NodeServer
|
|||
useVduse: checkVduseSupport(),
|
||||
stateDir: stateDir,
|
||||
mounter: mount.New(""),
|
||||
volumeLocks: make(map[string]bool),
|
||||
}
|
||||
ns.cond = sync.NewCond(&ns.mu)
|
||||
if (ns.useVduse)
|
||||
{
|
||||
ns.restoreVduseDaemons()
|
||||
|
@ -81,299 +85,24 @@ func NewNodeServer(driver *Driver) *NodeServer
|
|||
return ns
|
||||
}
|
||||
|
||||
func checkVduseSupport() bool
|
||||
func (ns *NodeServer) lockVolume(lockId string)
|
||||
{
|
||||
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
|
||||
vduse := true
|
||||
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
|
||||
ns.mu.Lock()
|
||||
defer ns.mu.Unlock()
|
||||
for (ns.volumeLocks[lockId])
|
||||
{
|
||||
_, err := os.Stat("/sys/module/"+mod)
|
||||
if (err != nil)
|
||||
{
|
||||
if (!errors.Is(err, os.ErrNotExist))
|
||||
{
|
||||
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
|
||||
}
|
||||
c := exec.Command("/sbin/modprobe", mod)
|
||||
c.Stdout = os.Stderr
|
||||
c.Stderr = os.Stderr
|
||||
err := c.Run()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
|
||||
vduse = false
|
||||
break
|
||||
}
|
||||
}
|
||||
ns.cond.Wait()
|
||||
}
|
||||
// Check that vdpa tool functions
|
||||
if (vduse)
|
||||
{
|
||||
c := exec.Command("/sbin/vdpa", "-j", "dev")
|
||||
c.Stderr = os.Stderr
|
||||
err := c.Run()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
|
||||
vduse = false
|
||||
}
|
||||
}
|
||||
if (!vduse)
|
||||
{
|
||||
klog.Errorf(
|
||||
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
|
||||
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
|
||||
)
|
||||
}
|
||||
return vduse
|
||||
ns.volumeLocks[lockId] = true
|
||||
ns.cond.Broadcast()
|
||||
}
|
||||
|
||||
// NodeStageVolume mounts the volume to a staging path on the node.
|
||||
func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
|
||||
func (ns *NodeServer) unlockVolume(lockId string)
|
||||
{
|
||||
return &csi.NodeStageVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
// NodeUnstageVolume unstages the volume from the staging path
|
||||
func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
|
||||
{
|
||||
return &csi.NodeUnstageVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
func Contains(list []string, s string) bool
|
||||
{
|
||||
for i := 0; i < len(list); i++
|
||||
{
|
||||
if (list[i] == s)
|
||||
{
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
|
||||
{
|
||||
// Map NBD device
|
||||
// FIXME: Check if already mapped
|
||||
args := []string{
|
||||
"map", "--image", volName,
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
if (readonly)
|
||||
{
|
||||
args = append(args, "--readonly", "1")
|
||||
}
|
||||
stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
|
||||
dev := strings.TrimSpace(string(stdout))
|
||||
if (dev == "")
|
||||
{
|
||||
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
|
||||
}
|
||||
return dev, err
|
||||
}
|
||||
|
||||
func (ns *NodeServer) unmapNbd(devicePath string)
|
||||
{
|
||||
// unmap NBD device
|
||||
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||
if (unmapErr != nil)
|
||||
{
|
||||
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||
}
|
||||
}
|
||||
|
||||
func findByPidFile(pidFile string) (*os.Process, error)
|
||||
{
|
||||
pidBuf, err := os.ReadFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
proc, err := os.FindProcess(int(pid))
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
return proc, nil
|
||||
}
|
||||
|
||||
func killByPidFile(pidFile string) error
|
||||
{
|
||||
klog.Infof("killing process with PID from file %s", pidFile)
|
||||
proc, err := findByPidFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
return err
|
||||
}
|
||||
return proc.Signal(syscall.SIGTERM)
|
||||
}
|
||||
|
||||
func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
|
||||
{
|
||||
// Start qemu-storage-daemon
|
||||
blockSpec := map[string]interface{}{
|
||||
"node-name": "disk1",
|
||||
"driver": "vitastor",
|
||||
"image": volName,
|
||||
"cache": map[string]bool{
|
||||
"direct": true,
|
||||
"no-flush": false,
|
||||
},
|
||||
"discard": "unmap",
|
||||
}
|
||||
if (configPath != "")
|
||||
{
|
||||
blockSpec["config-path"] = configPath
|
||||
}
|
||||
blockSpecJson, _ := json.Marshal(blockSpec)
|
||||
writable := "true"
|
||||
if (readonly)
|
||||
{
|
||||
writable = "false"
|
||||
}
|
||||
_, _, err := system(
|
||||
"/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
|
||||
"--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
|
||||
{
|
||||
// Generate state file
|
||||
stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
|
||||
if (err != nil)
|
||||
{
|
||||
return "", "", err
|
||||
}
|
||||
stateFile := stateFd.Name()
|
||||
stateFd.Close()
|
||||
vdpaId := filepath.Base(stateFile)
|
||||
vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
|
||||
pidFile := ns.stateDir + vdpaId + ".pid"
|
||||
// Map VDUSE device via qemu-storage-daemon
|
||||
err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
|
||||
if (err == nil)
|
||||
{
|
||||
// Add device to VDPA bus
|
||||
_, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
|
||||
if (err == nil)
|
||||
{
|
||||
// Find block device name
|
||||
var matches []string
|
||||
matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
|
||||
if (err == nil && len(matches) == 0)
|
||||
{
|
||||
err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
|
||||
}
|
||||
if (err == nil)
|
||||
{
|
||||
blockdev := "/dev/"+filepath.Base(matches[0])
|
||||
_, err = os.Stat(blockdev)
|
||||
if (err == nil)
|
||||
{
|
||||
// Generate state file
|
||||
stateJSON, _ := json.Marshal(&DeviceState{
|
||||
ConfigPath: ctxVars["configPath"],
|
||||
VdpaId: vdpaId,
|
||||
Image: volName,
|
||||
Blockdev: blockdev,
|
||||
Readonly: readonly,
|
||||
PidFile: pidFile,
|
||||
})
|
||||
err = os.WriteFile(stateFile, stateJSON, 0600)
|
||||
if (err == nil)
|
||||
{
|
||||
return blockdev, vdpaId, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
killErr := killByPidFile(pidFile)
|
||||
if (killErr != nil)
|
||||
{
|
||||
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
|
||||
}
|
||||
os.Remove(stateFile)
|
||||
os.Remove(pidFile)
|
||||
}
|
||||
return "", "", err
|
||||
}
|
||||
|
||||
func (ns *NodeServer) unmapVduse(devicePath string)
|
||||
{
|
||||
if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
|
||||
{
|
||||
klog.Errorf("%s does not start with /dev/v", devicePath)
|
||||
return
|
||||
}
|
||||
vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
|
||||
return
|
||||
}
|
||||
vdpaId := ""
|
||||
p := strings.Index(vduseDev, "/vduse/")
|
||||
if (p >= 0)
|
||||
{
|
||||
vduseDev = vduseDev[p+7:]
|
||||
p = strings.Index(vduseDev, "/")
|
||||
if (p >= 0)
|
||||
{
|
||||
vdpaId = vduseDev[0:p]
|
||||
}
|
||||
}
|
||||
if (vdpaId == "")
|
||||
{
|
||||
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
|
||||
return
|
||||
}
|
||||
ns.unmapVduseById(vdpaId)
|
||||
}
|
||||
|
||||
func (ns *NodeServer) unmapVduseById(vdpaId string)
|
||||
{
|
||||
_, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
|
||||
}
|
||||
else
|
||||
{
|
||||
_, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
|
||||
}
|
||||
stateFile := ns.stateDir + vdpaId + ".json"
|
||||
os.Remove(stateFile)
|
||||
pidFile := ns.stateDir + vdpaId + ".pid"
|
||||
_, err = os.Stat(pidFile)
|
||||
if (os.IsNotExist(err))
|
||||
{
|
||||
// ok, already killed
|
||||
}
|
||||
else if (err != nil)
|
||||
{
|
||||
klog.Errorf("Failed to stat %v: %v", pidFile, err)
|
||||
return
|
||||
}
|
||||
else
|
||||
{
|
||||
err = killByPidFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
|
||||
}
|
||||
os.Remove(pidFile)
|
||||
}
|
||||
ns.mu.Lock()
|
||||
defer ns.mu.Unlock()
|
||||
delete(ns.volumeLocks, lockId)
|
||||
ns.cond.Broadcast()
|
||||
}
|
||||
|
||||
func (ns *NodeServer) restarter()
|
||||
|
@ -422,58 +151,83 @@ func (ns *NodeServer) restoreVduseDaemons()
|
|||
vdpaId := filepath.Base(stateFile)
|
||||
vdpaId = vdpaId[0:len(vdpaId)-5]
|
||||
// Check if VDPA device is still added to the bus
|
||||
if (devs[vdpaId] != nil)
|
||||
{
|
||||
// Check if the storage daemon is still active
|
||||
pidFile := ns.stateDir + vdpaId + ".pid"
|
||||
exists := false
|
||||
proc, err := findByPidFile(pidFile)
|
||||
if (err == nil)
|
||||
{
|
||||
exists = proc.Signal(syscall.Signal(0)) == nil
|
||||
}
|
||||
if (!exists)
|
||||
{
|
||||
// Restart daemon
|
||||
stateJSON, err := os.ReadFile(stateFile)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Warningf("error reading state file %v: %v", stateFile, err)
|
||||
}
|
||||
else
|
||||
{
|
||||
var state DeviceState
|
||||
err := json.Unmarshal(stateJSON, &state)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
|
||||
}
|
||||
else
|
||||
{
|
||||
klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
|
||||
_ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
if (devs[vdpaId] == nil)
|
||||
{
|
||||
// Unused, clean it up
|
||||
ns.unmapVduseById(vdpaId)
|
||||
unmapVduseById(ns.stateDir, vdpaId)
|
||||
continue
|
||||
}
|
||||
|
||||
stateJSON, err := os.ReadFile(stateFile)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Warningf("error reading state file %v: %v", stateFile, err)
|
||||
continue
|
||||
}
|
||||
var state DeviceState
|
||||
err = json.Unmarshal(stateJSON, &state)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
|
||||
continue
|
||||
}
|
||||
|
||||
ns.lockVolume(state.ConfigPath+":"+state.Image)
|
||||
|
||||
// Recheck state file after locking
|
||||
_, err = os.ReadFile(stateFile)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Warningf("state file %v disappeared, skipping volume", stateFile)
|
||||
ns.unlockVolume(state.ConfigPath+":"+state.Image)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if the storage daemon is still active
|
||||
pidFile := ns.stateDir + vdpaId + ".pid"
|
||||
exists := false
|
||||
proc, err := findByPidFile(pidFile)
|
||||
if (err == nil)
|
||||
{
|
||||
exists = proc.Signal(syscall.Signal(0)) == nil
|
||||
}
|
||||
if (!exists)
|
||||
{
|
||||
// Restart daemon
|
||||
klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
|
||||
_ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
|
||||
}
|
||||
|
||||
ns.unlockVolume(state.ConfigPath+":"+state.Image)
|
||||
}
|
||||
}
|
||||
|
||||
// NodePublishVolume mounts the volume mounted to the staging path to the target path
|
||||
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
|
||||
// NodeStageVolume mounts the volume to a staging path on the node.
|
||||
func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
|
||||
{
|
||||
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
|
||||
klog.Infof("received node stage volume request %+v", protosanitizer.StripSecrets(req))
|
||||
|
||||
targetPath := req.GetTargetPath()
|
||||
ctxVars := make(map[string]string)
|
||||
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||
}
|
||||
_, err = GetConnectionParams(ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
ns.lockVolume(ctxVars["configPath"]+":"+volName)
|
||||
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
|
||||
|
||||
targetPath := req.GetStagingTargetPath()
|
||||
isBlock := req.GetVolumeCapability().GetBlock() != nil
|
||||
|
||||
// Check that it's not already mounted
|
||||
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||
if (err != nil)
|
||||
{
|
||||
if (os.IsNotExist(err))
|
||||
|
@ -509,28 +263,14 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
|
|||
}
|
||||
}
|
||||
|
||||
ctxVars := make(map[string]string)
|
||||
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
_, err = GetConnectionParams(ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var devicePath, vdpaId string
|
||||
if (!ns.useVduse)
|
||||
{
|
||||
devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
|
||||
devicePath, err = mapNbd(volName, ctxVars, false)
|
||||
}
|
||||
else
|
||||
{
|
||||
devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
|
||||
devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
|
||||
}
|
||||
if (err != nil)
|
||||
{
|
||||
|
@ -614,26 +354,182 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
|
|||
)
|
||||
goto unmap
|
||||
}
|
||||
return &csi.NodePublishVolumeResponse{}, nil
|
||||
return &csi.NodeStageVolumeResponse{}, nil
|
||||
|
||||
unmap:
|
||||
if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
|
||||
{
|
||||
ns.unmapNbd(devicePath)
|
||||
unmapNbd(devicePath)
|
||||
}
|
||||
else
|
||||
{
|
||||
ns.unmapVduseById(vdpaId)
|
||||
unmapVduseById(ns.stateDir, vdpaId)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// NodeUnstageVolume unstages the volume from the staging path
|
||||
func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
|
||||
{
|
||||
klog.Infof("received node unstage volume request %+v", protosanitizer.StripSecrets(req))
|
||||
|
||||
ctxVars := make(map[string]string)
|
||||
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
ns.lockVolume(ctxVars["configPath"]+":"+volName)
|
||||
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
|
||||
|
||||
targetPath := req.GetStagingTargetPath()
|
||||
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
|
||||
if (err != nil)
|
||||
{
|
||||
if (os.IsNotExist(err))
|
||||
{
|
||||
return nil, status.Error(codes.NotFound, "Target path not found")
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
if (devicePath == "")
|
||||
{
|
||||
// volume not mounted
|
||||
klog.Warningf("%s is not a mountpoint, deleting", targetPath)
|
||||
os.Remove(targetPath)
|
||||
return &csi.NodeUnstageVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
// unmount
|
||||
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// unmap device
|
||||
if (refCount == 1)
|
||||
{
|
||||
if (!ns.useVduse)
|
||||
{
|
||||
unmapNbd(devicePath)
|
||||
}
|
||||
else
|
||||
{
|
||||
unmapVduse(ns.stateDir, devicePath)
|
||||
}
|
||||
}
|
||||
|
||||
return &csi.NodeUnstageVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
// NodePublishVolume mounts the volume mounted to the staging path to the target path
|
||||
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
|
||||
{
|
||||
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
|
||||
|
||||
ctxVars := make(map[string]string)
|
||||
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||
}
|
||||
_, err = GetConnectionParams(ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
ns.lockVolume(ctxVars["configPath"]+":"+volName)
|
||||
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
|
||||
|
||||
stagingTargetPath := req.GetStagingTargetPath()
|
||||
targetPath := req.GetTargetPath()
|
||||
isBlock := req.GetVolumeCapability().GetBlock() != nil
|
||||
|
||||
// Check that stagingTargetPath is mounted
|
||||
_, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
|
||||
return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
|
||||
}
|
||||
|
||||
// Check that targetPath is not already mounted
|
||||
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||
if (err != nil)
|
||||
{
|
||||
if (os.IsNotExist(err))
|
||||
{
|
||||
if (isBlock)
|
||||
{
|
||||
pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
|
||||
return nil, err
|
||||
}
|
||||
err = pathFile.Close()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to close %s with error: %v", targetPath, err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
err := os.MkdirAll(targetPath, 0777)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
execArgs := []string{"--bind", stagingTargetPath, targetPath}
|
||||
if (req.GetReadonly())
|
||||
{
|
||||
execArgs = append(execArgs, "-o", "ro")
|
||||
}
|
||||
cmd := exec.Command("mount", execArgs...)
|
||||
cmd.Stderr = os.Stderr
|
||||
klog.Infof("binding volume %v (%v) from %v to %v", volName, ctxVars["configPath"], stagingTargetPath, targetPath)
|
||||
out, err := cmd.Output()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, fmt.Errorf("Error running mount %v: %s", strings.Join(execArgs, " "), out)
|
||||
}
|
||||
|
||||
return &csi.NodePublishVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
// NodeUnpublishVolume unmounts the volume from the target path
|
||||
func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
|
||||
{
|
||||
klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
|
||||
|
||||
ctxVars := make(map[string]string)
|
||||
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
ns.lockVolume(ctxVars["configPath"]+":"+volName)
|
||||
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
|
||||
|
||||
targetPath := req.GetTargetPath()
|
||||
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
|
||||
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
|
||||
if (err != nil)
|
||||
{
|
||||
if (os.IsNotExist(err))
|
||||
|
@ -649,24 +545,14 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
|
|||
os.Remove(targetPath)
|
||||
return &csi.NodeUnpublishVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
// unmount
|
||||
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
// unmap NBD device
|
||||
if (refCount == 1)
|
||||
{
|
||||
if (!ns.useVduse)
|
||||
{
|
||||
ns.unmapNbd(devicePath)
|
||||
}
|
||||
else
|
||||
{
|
||||
ns.unmapVduse(devicePath)
|
||||
}
|
||||
}
|
||||
|
||||
return &csi.NodeUnpublishVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
|
@ -685,7 +571,17 @@ func (ns *NodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandV
|
|||
// NodeGetCapabilities returns the supported capabilities of the node server
|
||||
func (ns *NodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error)
|
||||
{
|
||||
return &csi.NodeGetCapabilitiesResponse{}, nil
|
||||
return &csi.NodeGetCapabilitiesResponse{
|
||||
Capabilities: []*csi.NodeServiceCapability{
|
||||
&csi.NodeServiceCapability{
|
||||
Type: &csi.NodeServiceCapability_Rpc{
|
||||
Rpc: &csi.NodeServiceCapability_RPC{
|
||||
Type: csi.NodeServiceCapability_RPC_STAGE_UNSTAGE_VOLUME,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// NodeGetInfo returns NodeGetInfoResponse for CO.
|
||||
|
|
|
@ -0,0 +1,301 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
package vitastor
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"k8s.io/klog"
|
||||
)
|
||||
|
||||
func Contains(list []string, s string) bool
|
||||
{
|
||||
for i := 0; i < len(list); i++
|
||||
{
|
||||
if (list[i] == s)
|
||||
{
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func checkVduseSupport() bool
|
||||
{
|
||||
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
|
||||
vduse := true
|
||||
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
|
||||
{
|
||||
_, err := os.Stat("/sys/module/"+mod)
|
||||
if (err != nil)
|
||||
{
|
||||
if (!errors.Is(err, os.ErrNotExist))
|
||||
{
|
||||
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
|
||||
}
|
||||
c := exec.Command("/sbin/modprobe", mod)
|
||||
c.Stdout = os.Stderr
|
||||
c.Stderr = os.Stderr
|
||||
err := c.Run()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
|
||||
vduse = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check that vdpa tool functions
|
||||
if (vduse)
|
||||
{
|
||||
c := exec.Command("/sbin/vdpa", "-j", "dev")
|
||||
c.Stderr = os.Stderr
|
||||
err := c.Run()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
|
||||
vduse = false
|
||||
}
|
||||
}
|
||||
if (!vduse)
|
||||
{
|
||||
klog.Errorf(
|
||||
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
|
||||
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
|
||||
)
|
||||
}
|
||||
return vduse
|
||||
}
|
||||
|
||||
func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
|
||||
{
|
||||
// Map NBD device
|
||||
// FIXME: Check if already mapped
|
||||
args := []string{
|
||||
"map", "--image", volName,
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
if (readonly)
|
||||
{
|
||||
args = append(args, "--readonly", "1")
|
||||
}
|
||||
stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
|
||||
dev := strings.TrimSpace(string(stdout))
|
||||
if (dev == "")
|
||||
{
|
||||
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
|
||||
}
|
||||
return dev, err
|
||||
}
|
||||
|
||||
func unmapNbd(devicePath string)
|
||||
{
|
||||
// unmap NBD device
|
||||
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||
if (unmapErr != nil)
|
||||
{
|
||||
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||
}
|
||||
}
|
||||
|
||||
func findByPidFile(pidFile string) (*os.Process, error)
|
||||
{
|
||||
pidBuf, err := os.ReadFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
proc, err := os.FindProcess(int(pid))
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
}
|
||||
return proc, nil
|
||||
}
|
||||
|
||||
func killByPidFile(pidFile string) error
|
||||
{
|
||||
klog.Infof("killing process with PID from file %s", pidFile)
|
||||
proc, err := findByPidFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
return err
|
||||
}
|
||||
return proc.Signal(syscall.SIGTERM)
|
||||
}
|
||||
|
||||
func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
|
||||
{
|
||||
// Start qemu-storage-daemon
|
||||
blockSpec := map[string]interface{}{
|
||||
"node-name": "disk1",
|
||||
"driver": "vitastor",
|
||||
"image": volName,
|
||||
"cache": map[string]bool{
|
||||
"direct": true,
|
||||
"no-flush": false,
|
||||
},
|
||||
"discard": "unmap",
|
||||
}
|
||||
if (configPath != "")
|
||||
{
|
||||
blockSpec["config-path"] = configPath
|
||||
}
|
||||
blockSpecJson, _ := json.Marshal(blockSpec)
|
||||
writable := "true"
|
||||
if (readonly)
|
||||
{
|
||||
writable = "false"
|
||||
}
|
||||
_, _, err := system(
|
||||
"/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
|
||||
"--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func mapVduse(stateDir string, volName string, ctxVars map[string]string, readonly bool) (string, string, error)
|
||||
{
|
||||
// Generate state file
|
||||
stateFd, err := os.CreateTemp(stateDir, "vitastor-vduse-*.json")
|
||||
if (err != nil)
|
||||
{
|
||||
return "", "", err
|
||||
}
|
||||
stateFile := stateFd.Name()
|
||||
stateFd.Close()
|
||||
vdpaId := filepath.Base(stateFile)
|
||||
vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
|
||||
pidFile := stateDir + vdpaId + ".pid"
|
||||
// Map VDUSE device via qemu-storage-daemon
|
||||
err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
|
||||
if (err == nil)
|
||||
{
|
||||
// Add device to VDPA bus
|
||||
_, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
|
||||
if (err == nil)
|
||||
{
|
||||
// Find block device name
|
||||
var matches []string
|
||||
matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
|
||||
if (err == nil && len(matches) == 0)
|
||||
{
|
||||
err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
|
||||
}
|
||||
if (err == nil)
|
||||
{
|
||||
blockdev := "/dev/"+filepath.Base(matches[0])
|
||||
_, err = os.Stat(blockdev)
|
||||
if (err == nil)
|
||||
{
|
||||
// Generate state file
|
||||
stateJSON, _ := json.Marshal(&DeviceState{
|
||||
ConfigPath: ctxVars["configPath"],
|
||||
VdpaId: vdpaId,
|
||||
Image: volName,
|
||||
Blockdev: blockdev,
|
||||
Readonly: readonly,
|
||||
PidFile: pidFile,
|
||||
})
|
||||
err = os.WriteFile(stateFile, stateJSON, 0600)
|
||||
if (err == nil)
|
||||
{
|
||||
return blockdev, vdpaId, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
killErr := killByPidFile(pidFile)
|
||||
if (killErr != nil)
|
||||
{
|
||||
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
|
||||
}
|
||||
os.Remove(stateFile)
|
||||
os.Remove(pidFile)
|
||||
}
|
||||
return "", "", err
|
||||
}
|
||||
|
||||
func unmapVduse(stateDir, devicePath string)
|
||||
{
|
||||
if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
|
||||
{
|
||||
klog.Errorf("%s does not start with /dev/v", devicePath)
|
||||
return
|
||||
}
|
||||
vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
|
||||
return
|
||||
}
|
||||
vdpaId := ""
|
||||
p := strings.Index(vduseDev, "/vduse/")
|
||||
if (p >= 0)
|
||||
{
|
||||
vduseDev = vduseDev[p+7:]
|
||||
p = strings.Index(vduseDev, "/")
|
||||
if (p >= 0)
|
||||
{
|
||||
vdpaId = vduseDev[0:p]
|
||||
}
|
||||
}
|
||||
if (vdpaId == "")
|
||||
{
|
||||
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
|
||||
return
|
||||
}
|
||||
unmapVduseById(stateDir, vdpaId)
|
||||
}
|
||||
|
||||
func unmapVduseById(stateDir, vdpaId string)
|
||||
{
|
||||
_, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
|
||||
}
|
||||
else
|
||||
{
|
||||
_, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
|
||||
}
|
||||
stateFile := stateDir + vdpaId + ".json"
|
||||
os.Remove(stateFile)
|
||||
pidFile := stateDir + vdpaId + ".pid"
|
||||
_, err = os.Stat(pidFile)
|
||||
if (os.IsNotExist(err))
|
||||
{
|
||||
// ok, already killed
|
||||
}
|
||||
else if (err != nil)
|
||||
{
|
||||
klog.Errorf("Failed to stat %v: %v", pidFile, err)
|
||||
return
|
||||
}
|
||||
else
|
||||
{
|
||||
err = killByPidFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
|
||||
}
|
||||
os.Remove(pidFile)
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
vitastor (1.5.0-1) unstable; urgency=medium
|
||||
vitastor (1.6.1-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ Source: vitastor
|
|||
Section: admin
|
||||
Priority: optional
|
||||
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
|
||||
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
|
||||
Standards-Version: 4.5.0
|
||||
Homepage: https://vitastor.io/
|
||||
Rules-Requires-Root: no
|
||||
|
|
|
@ -25,7 +25,7 @@ RUN apt-get update
|
|||
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN apt-get -y build-dep fio
|
||||
RUN apt-get --download-only source fio
|
||||
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev
|
||||
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev
|
||||
|
||||
ADD . /root/vitastor
|
||||
RUN set -e -x; \
|
||||
|
@ -37,8 +37,8 @@ RUN set -e -x; \
|
|||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-1.5.0; \
|
||||
cd vitastor-1.5.0; \
|
||||
cp -r /root/vitastor vitastor-1.6.1; \
|
||||
cd vitastor-1.6.1; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
|
@ -51,8 +51,8 @@ RUN set -e -x; \
|
|||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.5.0.orig.tar.xz vitastor-1.5.0; \
|
||||
cd vitastor-1.5.0; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
|
||||
cd vitastor-1.6.1; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
|
|
@ -11,6 +11,7 @@ affect their interaction with the cluster.
|
|||
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_retry_enospc](#client_retry_enospc)
|
||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||
- [client_enable_writeback](#client_enable_writeback)
|
||||
|
@ -20,6 +21,7 @@ affect their interaction with the cluster.
|
|||
- [nbd_timeout](#nbd_timeout)
|
||||
- [nbd_max_devices](#nbd_max_devices)
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
|
@ -41,6 +43,15 @@ Retry time for I/O requests failed due to data corruption or unfinished
|
|||
EC object deletions (has_incomplete PG state). 0 disables such retries
|
||||
and clients are not blocked and just get EIO error code instead.
|
||||
|
||||
## client_retry_enospc
|
||||
|
||||
- Type: boolean
|
||||
- Default: true
|
||||
- Can be changed online: yes
|
||||
|
||||
Retry writes on out of space errors to wait until some space is freed on
|
||||
OSDs.
|
||||
|
||||
## client_max_dirty_bytes
|
||||
|
||||
- Type: integer
|
||||
|
@ -157,3 +168,18 @@ Maximum number of NBD devices in the system. This value is passed as
|
|||
Maximum number of partitions per NBD device. This value is passed as
|
||||
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
|
||||
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
|
||||
|
||||
## osd_nearfull_ratio
|
||||
|
||||
- Type: number
|
||||
- Default: 0.95
|
||||
- Can be changed online: yes
|
||||
|
||||
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
|
||||
|
||||
Remember that some client writes may hang or complete with an error if even
|
||||
just one OSD becomes 100 % full!
|
||||
|
||||
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
|
||||
unable to start at all), so you'll be able to recover from "out of space" errors
|
||||
without destroying and recreating OSDs.
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_retry_enospc](#client_retry_enospc)
|
||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||
- [client_enable_writeback](#client_enable_writeback)
|
||||
|
@ -20,6 +21,7 @@
|
|||
- [nbd_timeout](#nbd_timeout)
|
||||
- [nbd_max_devices](#nbd_max_devices)
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
|
@ -42,6 +44,15 @@
|
|||
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
|
||||
этого просто получают код ошибки EIO.
|
||||
|
||||
## client_retry_enospc
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: true
|
||||
- Можно менять на лету: да
|
||||
|
||||
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
|
||||
ожидать, пока на OSD не освободится место.
|
||||
|
||||
## client_max_dirty_bytes
|
||||
|
||||
- Тип: целое число
|
||||
|
@ -158,3 +169,20 @@
|
|||
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
|
||||
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
|
||||
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
|
||||
|
||||
## osd_nearfull_ratio
|
||||
|
||||
- Тип: число
|
||||
- Значение по умолчанию: 0.95
|
||||
- Можно менять на лету: да
|
||||
|
||||
Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
|
||||
выводе vitastor-cli status.
|
||||
|
||||
Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
|
||||
если на 100 % заполнится хотя бы 1 OSD!
|
||||
|
||||
Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
|
||||
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
||||
восстановить работу кластера после ошибок отсутствия свободного места
|
||||
без уничтожения и пересоздания OSD.
|
||||
|
|
|
@ -15,6 +15,7 @@ These parameters only apply to Monitors.
|
|||
- [mon_stats_timeout](#mon_stats_timeout)
|
||||
- [osd_out_time](#osd_out_time)
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
|
@ -77,3 +78,11 @@ values. Smaller priority means higher level in tree. For example,
|
|||
levels are always predefined and can't be removed. If one of them is not
|
||||
present in the configuration, then it is defined with the default priority
|
||||
(100 for "host", 101 for "osd").
|
||||
|
||||
## use_old_pg_combinator
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
|
||||
and [raw_placement](pool.en.md#raw_placement) for pools which don't use this features.
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
- [mon_stats_timeout](#mon_stats_timeout)
|
||||
- [osd_out_time](#osd_out_time)
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
|
@ -78,3 +79,11 @@ OSD перед обновлением агрегированной статис
|
|||
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
|
||||
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
|
||||
умолчанию (100 для уровня "host", 101 для "osd").
|
||||
|
||||
## use_old_pg_combinator
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
|
||||
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
|
||||
|
|
|
@ -29,7 +29,7 @@ between clients, OSDs and etcd.
|
|||
- [etcd_quick_timeout](#etcd_quick_timeout)
|
||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
||||
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
|
||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||
|
||||
## tcp_header_buffer_size
|
||||
|
||||
|
@ -245,7 +245,7 @@ Timeout for etcd requests which are allowed to wait for some time.
|
|||
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
|
||||
etcd_report_interval to guarantee that keepalive actually works.
|
||||
|
||||
## etcd_ws_keepalive_timeout
|
||||
## etcd_ws_keepalive_interval
|
||||
|
||||
- Type: seconds
|
||||
- Default: 30
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
- [etcd_quick_timeout](#etcd_quick_timeout)
|
||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
|
||||
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
|
||||
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
|
||||
|
||||
## tcp_header_buffer_size
|
||||
|
||||
|
@ -256,7 +256,7 @@ OSD в любом случае согласовывают реальное зн
|
|||
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
|
||||
## etcd_ws_keepalive_timeout
|
||||
## etcd_ws_keepalive_interval
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 30
|
||||
|
|
|
@ -32,6 +32,8 @@ Parameters:
|
|||
- [pg_minsize](#pg_minsize)
|
||||
- [pg_count](#pg_count)
|
||||
- [failure_domain](#failure_domain)
|
||||
- [level_placement](#level_placement)
|
||||
- [raw_placement](#raw_placement)
|
||||
- [max_osd_combinations](#max_osd_combinations)
|
||||
- [block_size](#block_size)
|
||||
- [bitmap_granularity](#bitmap_granularity)
|
||||
|
@ -84,7 +86,11 @@ Parent node reference is required for intermediate tree nodes.
|
|||
Separate OSD settings are set in etc keys `/vitastor/config/osd/<number>`
|
||||
in JSON format `{"<key>":<value>}`.
|
||||
|
||||
As of now, two settings are supported:
|
||||
As of now, the following settings are supported:
|
||||
|
||||
- [reweight](#reweight)
|
||||
- [tags](#tags)
|
||||
- [noout](#noout)
|
||||
|
||||
## reweight
|
||||
|
||||
|
@ -107,6 +113,14 @@ subsets and then use a specific subset for pool instead of all OSDs.
|
|||
For example you can mark SSD OSDs with tag "ssd" and HDD OSDs with "hdd" and
|
||||
such tags will work as device classes.
|
||||
|
||||
## noout
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
If set to true, [osd_out_time](monitor.en.md#osd_out_time) is ignored for this
|
||||
OSD and it's never removed from data distribution by the monitor.
|
||||
|
||||
# Pool parameters
|
||||
|
||||
## name
|
||||
|
@ -209,6 +223,69 @@ never put on OSDs in the same failure domain (for example, on the same host).
|
|||
So failure domain specifies the unit which failure you are protecting yourself
|
||||
from.
|
||||
|
||||
## level_placement
|
||||
|
||||
- Type: string
|
||||
|
||||
Additional failure domain rules, applied in conjuction with failure_domain.
|
||||
Must be specified in the following form:
|
||||
|
||||
`<placement level>=<sequence of characters>, <level2>=<sequence2>, ...`
|
||||
|
||||
Sequence should be exactly [pg_size](#pg_size) character long. Each character
|
||||
corresponds to an OSD in the PG of this pool. Equal characters mean that
|
||||
corresponding items of the PG should be placed into the same placement tree
|
||||
item at this level. Different characters mean that items should be placed into
|
||||
different items.
|
||||
|
||||
For example, if you want a EC 4+2 pool and you want every 2 chunks to be stored
|
||||
in its own datacenter and you also want each chunk to be stored on a different
|
||||
host, you should set `level_placement` to `dc=112233 host=123456`.
|
||||
|
||||
Or you can set `level_placement` to `dc=112233` and leave `failure_domain` empty,
|
||||
because `host` is the default `failure_domain` and it will be applied anyway.
|
||||
|
||||
Without this rule, it may happen that 3 chunks will be stored on OSDs in the
|
||||
same datacenter, and the data will become inaccessibly if that datacenter goes
|
||||
down in this case.
|
||||
|
||||
Of course, you should group your hosts into datacenters before applying the rule
|
||||
by setting [placement_levels](monitor.en.md#placement_levels) to something like
|
||||
`{"dc":90,"host":100,"osd":110}` and add DCs to [node_placement](#placement-tree),
|
||||
like `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
|
||||
|
||||
## raw_placement
|
||||
|
||||
- Type: string
|
||||
|
||||
Raw PG placement rules, specified in the form of a DSL (domain-specific language).
|
||||
Use only if you really know what you're doing :)
|
||||
|
||||
DSL specification:
|
||||
|
||||
```
|
||||
dsl := item | item ("\n" | ",") items
|
||||
item := "any" | rules
|
||||
rules := rule | rule rules
|
||||
rule := level operator arg
|
||||
level := /\w+/
|
||||
operator := "!=" | "=" | ">" | "?="
|
||||
arg := value | "(" values ")"
|
||||
values := value | value "," values
|
||||
value := item_ref | constant_id
|
||||
item_ref := /\d+/
|
||||
constant_id := /"([^"]+)"/
|
||||
```
|
||||
|
||||
"?=" operator means "preferred". I.e. `dc ?= "meow"` means "prefer datacenter meow
|
||||
for this chunk, but put into another dc if it's unavailable".
|
||||
|
||||
Examples:
|
||||
|
||||
- Simple 3 replicas with failure_domain=host: `any, host!=1, host!=(1,2)`
|
||||
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||
|
||||
## max_osd_combinations
|
||||
|
||||
- Type: integer
|
||||
|
|
|
@ -31,6 +31,8 @@
|
|||
- [pg_minsize](#pg_minsize)
|
||||
- [pg_count](#pg_count)
|
||||
- [failure_domain](#failure_domain)
|
||||
- [level_placement](#level_placement)
|
||||
- [raw_placement](#raw_placement)
|
||||
- [max_osd_combinations](#max_osd_combinations)
|
||||
- [block_size](#block_size)
|
||||
- [bitmap_granularity](#bitmap_granularity)
|
||||
|
@ -83,10 +85,11 @@
|
|||
Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
|
||||
в JSON-формате `{"<key>":<value>}`.
|
||||
|
||||
На данный момент поддерживаются две настройки:
|
||||
На данный момент поддерживаются следующие настройки:
|
||||
|
||||
- [reweight](#reweight)
|
||||
- [tags](#tags)
|
||||
- [noout](#noout)
|
||||
|
||||
## reweight
|
||||
|
||||
|
@ -110,6 +113,14 @@
|
|||
всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
|
||||
этом смысле теги работают аналогично классам устройств.
|
||||
|
||||
## noout
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Если установлено в true, то [osd_out_time](monitor.ru.md#osd_out_time) для этого
|
||||
OSD игнорируется и OSD не удаляется из распределения данных монитором.
|
||||
|
||||
# Параметры
|
||||
|
||||
## name
|
||||
|
@ -161,7 +172,7 @@ OSD, PG деактивируется на чтение и запись. Иным
|
|||
Для примера, разница между pg_minsize 2 и 1 в реплицированном пуле с 3 копиями
|
||||
данных (pg_size=3), проявляется следующим образом:
|
||||
- Если 2 сервера отключаются при pg_minsize=2, пул становится неактивным и
|
||||
остаётся неактивным в течение [osd_out_time](monitor.en.md#osd_out_time)
|
||||
остаётся неактивным в течение [osd_out_time](monitor.ru.md#osd_out_time)
|
||||
(10 минут), после чего монитор назначает другие OSD/серверы на замену, пул
|
||||
поднимается и начинает восстанавливать недостающие копии данных. Соответственно,
|
||||
если OSD на замену нет - то есть, если у вас всего 3 сервера с OSD и 2 из них
|
||||
|
@ -169,7 +180,7 @@ OSD, PG деактивируется на чтение и запись. Иным
|
|||
или не добавите хотя бы 1 сервер (или не переключите failure_domain на "osd").
|
||||
- Если 2 сервера отключаются при pg_minsize=1, ввод-вывод лишь приостанавливается
|
||||
на короткое время, до тех пор, пока монитор не поймёт, что OSD отключены
|
||||
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.en.md#etcd_report_interval)).
|
||||
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.ru.md#etcd_report_interval)).
|
||||
После этого ввод-вывод восстанавливается, но новые данные временно пишутся
|
||||
всего в 1 копии. Когда же проходит osd_out_time, монитор точно так же назначает
|
||||
другие OSD на замену выбывшим и пул начинает восстанавливать копии данных.
|
||||
|
@ -211,6 +222,71 @@ PG в Vitastor эферемерны, то есть вы можете менят
|
|||
Иными словами, домен отказа - это то, от отказа чего вы защищаете себя избыточным
|
||||
хранением.
|
||||
|
||||
## level_placement
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Правила дополнительных доменов отказа, применяемые вместе с failure_domain.
|
||||
Должны задаваться в следующем виде:
|
||||
|
||||
`<уровень>=<последовательность символов>, <уровень2>=<последовательность2>, ...`
|
||||
|
||||
Каждая `<последовательность>` должна состоять ровно из [pg_size](#pg_size) символов.
|
||||
Каждый символ соответствует одному OSD (размещению одной части PG) этого пула.
|
||||
Одинаковые символы означают, что соответствующие части размещаются в один и тот же
|
||||
узел дерева OSD на заданном `<уровне>`. Разные символы означают, что части
|
||||
размещаются в разные узлы.
|
||||
|
||||
Например, если вы хотите сделать пул EC 4+2 и хотите поместить каждые 2 части
|
||||
данных в свой датацентр, и также вы хотите, чтобы каждая часть размещалась на
|
||||
другом хосте, то вы должны задать `level_placement` равным `dc=112233 host=123456`.
|
||||
|
||||
Либо вы просто можете задать `level_placement` равным `dc=112233` и оставить
|
||||
`failure_domain` пустым, т.к. `host` это его значение по умолчанию и оно также
|
||||
применится автоматически.
|
||||
|
||||
Без этого правила может получиться так, что в одном из датацентров окажется
|
||||
3 части данных одной PG и данные окажутся недоступными при временном отключении
|
||||
этого датацентра.
|
||||
|
||||
Естественно, перед установкой правила вам нужно сгруппировать ваши хосты в
|
||||
датацентры, установив [placement_levels](monitor.ru.md#placement_levels) во что-то
|
||||
типа `{"dc":90,"host":100,"osd":110}` и добавив датацентры в [node_placement](#дерево-размещения),
|
||||
примерно так: `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
|
||||
|
||||
## raw_placement
|
||||
|
||||
- Type: string
|
||||
|
||||
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
|
||||
Используйте, только если действительно знаете, зачем вам это надо :)
|
||||
|
||||
Спецификация DSL:
|
||||
|
||||
```
|
||||
dsl := item | item ("\n" | ",") items
|
||||
item := "any" | rules
|
||||
rules := rule | rule rules
|
||||
rule := level operator arg
|
||||
level := /\w+/
|
||||
operator := "!=" | "=" | ">" | "?="
|
||||
arg := value | "(" values ")"
|
||||
values := value | value "," values
|
||||
value := item_ref | constant_id
|
||||
item_ref := /\d+/
|
||||
constant_id := /"([^"]+)"/
|
||||
```
|
||||
|
||||
Оператор "?=" означает "предпочитаемый". Т.е. `dc ?= "meow"` означает "предпочитать
|
||||
датацентр meow для этой части данных, но разместить её в другом датацентре, если
|
||||
meow недоступен".
|
||||
|
||||
Примеры:
|
||||
|
||||
- Простые 3 реплики с failure_domain=host: `any, host!=1, host!=(1,2)`
|
||||
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||
|
||||
## max_osd_combinations
|
||||
|
||||
- Тип: целое число
|
||||
|
|
|
@ -22,6 +22,16 @@
|
|||
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
|
||||
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
|
||||
этого просто получают код ошибки EIO.
|
||||
- name: client_retry_enospc
|
||||
type: bool
|
||||
default: true
|
||||
online: true
|
||||
info: |
|
||||
Retry writes on out of space errors to wait until some space is freed on
|
||||
OSDs.
|
||||
info_ru: |
|
||||
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
|
||||
ожидать, пока на OSD не освободится место.
|
||||
- name: client_max_dirty_bytes
|
||||
type: int
|
||||
default: 33554432
|
||||
|
@ -190,3 +200,27 @@
|
|||
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
|
||||
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
|
||||
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
|
||||
- name: osd_nearfull_ratio
|
||||
type: float
|
||||
default: 0.95
|
||||
online: true
|
||||
info: |
|
||||
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
|
||||
|
||||
Remember that some client writes may hang or complete with an error if even
|
||||
just one OSD becomes 100 % full!
|
||||
|
||||
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
|
||||
unable to start at all), so you'll be able to recover from "out of space" errors
|
||||
without destroying and recreating OSDs.
|
||||
info_ru: |
|
||||
Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
|
||||
выводе vitastor-cli status.
|
||||
|
||||
Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
|
||||
если на 100 % заполнится хотя бы 1 OSD!
|
||||
|
||||
Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
|
||||
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
|
||||
восстановить работу кластера после ошибок отсутствия свободного места
|
||||
без уничтожения и пересоздания OSD.
|
||||
|
|
|
@ -56,6 +56,8 @@
|
|||
|
||||
{{../../usage/nfs.en.md}}
|
||||
|
||||
{{../../usage/admin.en.md}}
|
||||
|
||||
## Performance
|
||||
|
||||
{{../../performance/understanding.en.md}}
|
||||
|
@ -64,4 +66,6 @@
|
|||
|
||||
{{../../performance/comparison1.en.md}}
|
||||
|
||||
{{../../performance/bench2.en.md}}
|
||||
|
||||
{{../../intro/author.en.md|indent=1}}
|
||||
|
|
|
@ -56,6 +56,8 @@
|
|||
|
||||
{{../../usage/nfs.ru.md}}
|
||||
|
||||
{{../../usage/admin.ru.md}}
|
||||
|
||||
## Производительность
|
||||
|
||||
{{../../performance/understanding.ru.md}}
|
||||
|
@ -64,4 +66,6 @@
|
|||
|
||||
{{../../performance/comparison1.ru.md}}
|
||||
|
||||
{{../../performance/bench2.ru.md}}
|
||||
|
||||
{{../../intro/author.ru.md|indent=1}}
|
||||
|
|
|
@ -63,3 +63,12 @@
|
|||
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
|
||||
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
|
||||
умолчанию (100 для уровня "host", 101 для "osd").
|
||||
- name: use_old_pg_combinator
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
|
||||
and [raw_placement](pool.en.md#raw_placement) for pools which don't use this features.
|
||||
info_ru: |
|
||||
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
|
||||
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
|
||||
|
|
|
@ -280,7 +280,7 @@
|
|||
info_ru: |
|
||||
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
|
||||
etcd_report_interval, чтобы keepalive гарантированно работал.
|
||||
- name: etcd_ws_keepalive_timeout
|
||||
- name: etcd_ws_keepalive_interval
|
||||
type: sec
|
||||
default: 30
|
||||
online: true
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
## Server-side features
|
||||
|
||||
- Basic part: highly-available block storage with symmetric clustering and no SPOF
|
||||
- [Performance](../performance/comparison1.en.md) ;-D
|
||||
- [Performance](../performance/bench2.en.md) ;-D
|
||||
- [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
|
||||
based on jerasure and ISA-L libraries with any number of data and parity drives in a group
|
||||
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
## Серверные функции
|
||||
|
||||
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
|
||||
- [Производительность](../performance/comparison1.ru.md) ;-D
|
||||
- [Производительность](../performance/bench2.ru.md) ;-D
|
||||
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
|
||||
Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
|
||||
- Конфигурация через простые человекочитаемые JSON-структуры в etcd
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
[Documentation](../../README.md#documentation) → Performance → Newer benchmark of Vitastor 1.3.1
|
||||
|
||||
-----
|
||||
|
||||
[Читать на русском](bench2.ru.md)
|
||||
|
||||
# Newer benchmark of Vitastor 1.3.1
|
||||
|
||||
- [Test environment](#test-environment)
|
||||
- [Notes](#notes)
|
||||
- [Raw drive performance](#raw-drive-performance)
|
||||
- [2 replicas](#2-replicas)
|
||||
- [3 replicas](#3-replicas)
|
||||
- [EC 2+1](#ec-2-1)
|
||||
|
||||
## Test environment
|
||||
|
||||
Hardware configuration: 3 nodes, each with:
|
||||
- 8x NVMe Samsung PM9A3 1.92 TB
|
||||
- 2x Xeon Gold 6342 (24 cores @ 2.8 GHz)
|
||||
- 256 GB RAM
|
||||
- Dual-port 25 GbE Mellanox ConnectX-4 LX network card with RoCEv2
|
||||
- Connected to 2 Mellanox SN2010 switches with MLAG
|
||||
|
||||
## Notes
|
||||
|
||||
Vitastor version was 1.3.1.
|
||||
|
||||
Tests were run from the storage nodes - 4 fio clients per each of 3 nodes.
|
||||
|
||||
The same large 3 TB image was tested from all hosts because Vitastor has no
|
||||
performance penalties related to running multiple clients over a single inode.
|
||||
|
||||
CPU power saving was disabled. 4 OSDs were created per each NVMe.
|
||||
Checksums were not enabled. Tests with checksums will be conducted later,
|
||||
along with the newer version of Vitastor, and results will be updated.
|
||||
|
||||
CPU configuration was not optimal because of NUMA. It's better to avoid 2-socket
|
||||
platforms. It was especially noticeable in RDMA tests - in the form of ksoftirqd
|
||||
processes (usually 1 per server) eating 100 % of one CPU core and actual bandwidth
|
||||
of one network port reduced to 3-5 Gbit/s instead of 25 Gbit/s - probably because
|
||||
of RFS (Receive Flow Steering) misses. Many network configurations were tried during
|
||||
tests, but nothing helped to solve this problem, so final tests were conducted with
|
||||
the default settings.
|
||||
|
||||
# Raw drive performance
|
||||
|
||||
- Linear write ~1000-2000 MB/s, depending on current state of the drive's garbage collector
|
||||
- Linear read ~3300 MB/s
|
||||
- T1Q1 random write ~60000 iops (latency ~0.015ms)
|
||||
- T1Q1 random read ~14700 iops (latency ~0.066ms)
|
||||
- T1Q16 random write ~180000 iops
|
||||
- T1Q16 random read ~120000 iops
|
||||
- T1Q32 random write ~180000 iops
|
||||
- T1Q32 random read ~195000 iops
|
||||
- T1Q128 random write ~180000 iops
|
||||
- T1Q128 random read ~195000 iops
|
||||
- T4Q128 random write ~525000 iops
|
||||
- T4Q128 random read ~750000 iops
|
||||
|
||||
These numbers make obvious that results could be much better if a faster network
|
||||
was available, because NVMe drives obviously weren't a bottleneck. For example,
|
||||
theoretical maximum linear read performance for 24 drives could be 79.2 GByte/s,
|
||||
which is 633 Gbit/s. Real Vitastor read speed (both linear and random) was around
|
||||
16 Gbyte/s, which is 130 Gbit/s. It's important to note that it was still much
|
||||
larger than the network bandwidth of one server (50 Gbit/s). This is also correct
|
||||
because tests were conducted from all 3 nodes.
|
||||
|
||||
## 2 replicas
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Linear read (4M T6 Q16) | 13.13 GB/s | 16.25 GB/s |
|
||||
| Linear write (4M T6 Q16) | 8.16 GB/s | 7.88 GB/s |
|
||||
| Read 4k T1 Q1 | 8745 iops | 10252 iops |
|
||||
| Write 4k T1 Q1 | 8097 iops | 11488 iops |
|
||||
| Read 4k T12 Q128 | 1305936 iops | 4265861 iops |
|
||||
| Write 4k T12 Q128 | 660490 iops | 1384033 iops |
|
||||
|
||||
CPU consumption OSD per 1 disk:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|---------|---------|
|
||||
| Linear read (4M T6 Q16) | 29.7 % | 29.8 % |
|
||||
| Linear write (4M T6 Q16) | 84.4 % | 33.2 % |
|
||||
| Read 4k T12 Q128 | 98.4 % | 119.1 % |
|
||||
| Write 4k T12 Q128 | 173.4 % | 175.9 % |
|
||||
|
||||
CPU consumption per 1 client (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Linear read (4M T6 Q16) | 100 % | 85.2 % |
|
||||
| Linear write (4M T6 Q16) | 55.8 % | 48.8 % |
|
||||
| Read 4k T12 Q128 | 99.9 % | 96 % |
|
||||
| Write 4k T12 Q128 | 71.6 % | 48.5 % |
|
||||
|
||||
## 3 replicas
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Linear read (4M T6 Q16) | 13.98 GB/s | 16.54 GB/s |
|
||||
| Linear write (4M T6 Q16) | 5.38 GB/s | 5.7 GB/s |
|
||||
| Read 4k T1 Q1 | 8969 iops | 9980 iops |
|
||||
| Write 4k T1 Q1 | 8126 iops | 11672 iops |
|
||||
| Read 4k T12 Q128 | 1358818 iops | 4279088 iops |
|
||||
| Write 4k T12 Q128 | 433890 iops | 993506 iops |
|
||||
|
||||
CPU consumption OSD per 1 disk:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|---------|
|
||||
| Linear read (4M T6 Q16) | 24.9 % | 25.4 % |
|
||||
| Linear write (4M T6 Q16) | 99.3 % | 38.4 % |
|
||||
| Read 4k T12 Q128 | 95.3 % | 111.7 % |
|
||||
| Write 4k T12 Q128 | 173 % | 194 % |
|
||||
|
||||
CPU consumption per 1 client (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Linear read (4M T6 Q16) | 99.9 % | 85.8 % |
|
||||
| Linear write (4M T6 Q16) | 38.9 % | 38.1 % |
|
||||
| Read 4k T12 Q128 | 100 % | 96.1 % |
|
||||
| Write 4k T12 Q128 | 51.6 % | 41.9 % |
|
||||
|
||||
## EC 2+1
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Linear read (4M T6 Q16) | 10.07 GB/s | 11.43 GB/s |
|
||||
| Linear write (4M T6 Q16) | 7.74 GB/s | 8.32 GB/s |
|
||||
| Read 4k T1 Q1 | 7408 iops | 8891 iops |
|
||||
| Write 4k T1 Q1 | 3525 iops | 4903 iops |
|
||||
| Read 4k T12 Q128 | 1216496 iops | 2552765 iops |
|
||||
| Write 4k T12 Q128 | 278110 iops | 821261 iops |
|
||||
|
||||
CPU consumption OSD per 1 disk:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|---------|---------|
|
||||
| Linear read (4M T6 Q16) | 68.6 % | 33.6 % |
|
||||
| Linear write (4M T6 Q16) | 108.3 % | 50.2 % |
|
||||
| Read 4k T12 Q128 | 138.1 % | 97.9 % |
|
||||
| Write 4k T12 Q128 | 168.7 % | 188.5 % |
|
||||
|
||||
CPU consumption per 1 client (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Linear read (4M T6 Q16) | 88.2 % | 52.4 % |
|
||||
| Linear write (4M T6 Q16) | 51.8 % | 46.8 % |
|
||||
| Read 4k T12 Q128 | 99.7 % | 61.3 % |
|
||||
| Write 4k T12 Q128 | 35.1 % | 31.3 % |
|
|
@ -0,0 +1,157 @@
|
|||
[Документация](../../README-ru.md#документация) → Производительность → Более новый тест Vitastor 1.3.1
|
||||
|
||||
-----
|
||||
|
||||
[Read in English](bench2.en.md)
|
||||
|
||||
# Более новый тест Vitastor 1.3.1
|
||||
|
||||
- [Описание стенда](#описание-стенда)
|
||||
- [Примечания](#примечания)
|
||||
- [Производительность голых дисков](#производительность-голых-дисков)
|
||||
- [2 реплики](#2-реплики)
|
||||
- [3 реплики](#3-реплики)
|
||||
- [EC 2+1](#ec-2-1)
|
||||
|
||||
## Описание стенда
|
||||
|
||||
Железо: 3 сервера, в каждом:
|
||||
- 8x NVMe Samsung PM9A3 1.92 TB
|
||||
- 2x Xeon Gold 6342 (24 ядра @ 2.8 GHz)
|
||||
- 256 GB RAM
|
||||
- Двухпортовая 25 GbE сетевая карта Mellanox ConnectX-4 LX с поддержкой RoCEv2
|
||||
- Подключение к 2 коммутаторам Mellanox SN2010 в MLAG
|
||||
|
||||
## Примечания
|
||||
|
||||
Версия Vitastor 1.3.1.
|
||||
|
||||
Тесты проводились с самих серверов хранения - по 4 клиента fio с каждого из 3 серверов.
|
||||
|
||||
Тестировался один большой образ размером 3 ТБ со всех хостов - создавать отдельные образы
|
||||
для тестов в Vitastor необязательно, т.к. в Vitastor нет замедления при записи в один
|
||||
узел несколькими клиентами.
|
||||
|
||||
Экономия энергии CPU отключена. На каждый NVMe создавалось 4 OSD.
|
||||
Контрольные суммы не включались. Тесты с контрольными суммами будут проведены
|
||||
позднее. Тогда же будет протестирована более новая версия Vitastor, и результаты
|
||||
будут обновлены.
|
||||
|
||||
Конфигурация CPU стенда неоптимальна из-за NUMA - двухпроцессорных серверов лучше
|
||||
избегать. Особенно это проявлялось во время тестов с RDMA - выражалось это в потреблении
|
||||
100% одного ядра CPU одним процессом ksoftirqd и работой одного из двух сетевых портов
|
||||
на скорости 3-5 ГБит/с вместо 25 ГБит/с - предположительно, из-за "непопаданий" RFS
|
||||
(Receive Flow Steering) на нужные ядра. Решить эту проблему во время проведения тестов
|
||||
не получилось. Было перепробовано множество различных настроек, но в итоге тесты проведены
|
||||
с настройками по умолчанию, т.к. улучшения добиться не удалось.
|
||||
|
||||
# Производительность голых дисков
|
||||
|
||||
- Линейная запись ~1000-2000 МБ/с, в зависимости от состояния сборщика мусора диска
|
||||
- Линейное чтение ~3300 МБ/с
|
||||
- T1Q1 запись ~60000 iops (задержка ~0.015ms)
|
||||
- T1Q1 чтение ~14700 iops (задержка ~0.066ms)
|
||||
- T1Q16 запись ~180000 iops
|
||||
- T1Q16 чтение ~120000 iops
|
||||
- T1Q32 запись ~180000 iops
|
||||
- T1Q32 чтение ~195000 iops
|
||||
- T1Q128 запись ~180000 iops
|
||||
- T1Q128 чтение ~195000 iops
|
||||
- T4Q128 запись ~525000 iops
|
||||
- T4Q128 чтение ~750000 iops
|
||||
|
||||
Из данных цифр очевидно, что при наличии более быстрой сети результаты были бы
|
||||
значительно лучше, так как в диски тест, очевидно, не упирался. Например, теоретический предел по
|
||||
линейному чтению для 24 таких дисков был бы около 79.2 ГБайт/с, то есть,
|
||||
633 гигабита в секунду. Реальная скорость чтения (и случайного, и линейного)
|
||||
Vitastor составила примерно 16 ГБайт/с, то есть 130 гигабит в секунду. При этом
|
||||
следует заметить, что этот результат всё равно значительно лучше пропускной способности
|
||||
сети отдельно взятого узла - что тоже вполне логично, так как тест выполнялся со
|
||||
всех трёх узлов.
|
||||
|
||||
## 2 реплики
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Линейное чтение (4M T6 Q16) | 13.13 ГБ/с | 16.25 ГБ/с |
|
||||
| Линейная запись (4M T6 Q16) | 8.16 ГБ/с | 7.88 ГБ/с |
|
||||
| Чтение 4k T1 Q1 | 8745 iops | 10252 iops |
|
||||
| Запись 4k T1 Q1 | 8097 iops | 11488 iops |
|
||||
| Чтение 4k T12 Q128 | 1305936 iops | 4265861 iops |
|
||||
| Запись 4k T12 Q128 | 660490 iops | 1384033 iops |
|
||||
|
||||
Потребление CPU OSD на 1 диск:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|---------|---------|
|
||||
| Линейное чтение (4M T6 Q16) | 29.7 % | 29.8 % |
|
||||
| Линейная запись (4M T6 Q16) | 84.4 % | 33.2 % |
|
||||
| Чтение 4k T12 Q128 | 98.4 % | 119.1 % |
|
||||
| Запись 4k T12 Q128 | 173.4 % | 175.9 % |
|
||||
|
||||
Потребление CPU на 1 клиента (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Линейное чтение (4M T6 Q16) | 100 % | 85.2 % |
|
||||
| Линейная запись (4M T6 Q16) | 55.8 % | 48.8 % |
|
||||
| Чтение 4k T12 Q128 | 99.9 % | 96 % |
|
||||
| Запись 4k T12 Q128 | 71.6 % | 48.5 % |
|
||||
|
||||
## 3 реплики
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Линейное чтение (4M T6 Q16) | 13.98 ГБ/с | 16.54 ГБ/с |
|
||||
| Линейная запись (4M T6 Q16) | 5.38 ГБ/с | 5.7 ГБ/с |
|
||||
| Чтение 4k T1 Q1 | 8969 iops | 9980 iops |
|
||||
| Запись 4k T1 Q1 | 8126 iops | 11672 iops |
|
||||
| Чтение 4k T12 Q128 | 1358818 iops | 4279088 iops |
|
||||
| Запись 4k T12 Q128 | 433890 iops | 993506 iops |
|
||||
|
||||
Потребление CPU OSD на 1 диск:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|---------|
|
||||
| Линейное чтение (4M T6 Q16) | 24.9 % | 25.4 % |
|
||||
| Линейная запись (4M T6 Q16) | 99.3 % | 38.4 % |
|
||||
| Чтение 4k T12 Q128 | 95.3 % | 111.7 % |
|
||||
| Запись 4k T12 Q128 | 173 % | 194 % |
|
||||
|
||||
Потребление CPU на 1 клиента (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Линейное чтение (4M T6 Q16) | 99.9 % | 85.8 % |
|
||||
| Линейная запись (4M T6 Q16) | 38.9 % | 38.1 % |
|
||||
| Чтение 4k T12 Q128 | 100 % | 96.1 % |
|
||||
| Запись 4k T12 Q128 | 51.6 % | 41.9 % |
|
||||
|
||||
## EC 2+1
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------------|--------------|
|
||||
| Линейное чтение (4M T6 Q16) | 10.07 ГБ/с | 11.43 ГБ/с |
|
||||
| Линейная запись (4M T6 Q16) | 7.74 ГБ/с | 8.32 ГБ/с |
|
||||
| Чтение 4k T1 Q1 | 7408 iops | 8891 iops |
|
||||
| Запись 4k T1 Q1 | 3525 iops | 4903 iops |
|
||||
| Чтение 4k T12 Q128 | 1216496 iops | 2552765 iops |
|
||||
| Запись 4k T12 Q128 | 278110 iops | 821261 iops |
|
||||
|
||||
Потребление CPU OSD на 1 диск:
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|---------|---------|
|
||||
| Линейное чтение (4M T6 Q16) | 68.6 % | 33.6 % |
|
||||
| Линейная запись (4M T6 Q16) | 108.3 % | 50.2 % |
|
||||
| Чтение 4k T12 Q128 | 138.1 % | 97.9 % |
|
||||
| Запись 4k T12 Q128 | 168.7 % | 188.5 % |
|
||||
|
||||
Потребление CPU на 1 клиента (fio):
|
||||
|
||||
| | TCP | RDMA |
|
||||
|------------------------------|--------|--------|
|
||||
| Линейное чтение (4M T6 Q16) | 88.2 % | 52.4 % |
|
||||
| Линейная запись (4M T6 Q16) | 51.8 % | 46.8 % |
|
||||
| Чтение 4k T12 Q128 | 99.7 % | 61.3 % |
|
||||
| Запись 4k T12 Q128 | 35.1 % | 31.3 % |
|
|
@ -0,0 +1,215 @@
|
|||
[Documentation](../../README.md#documentation) → Usage → Administration
|
||||
|
||||
-----
|
||||
|
||||
[Читать на русском](admin.ru.md)
|
||||
|
||||
# Administration
|
||||
|
||||
- [Pool states](#pool-states)
|
||||
- [PG states](#pg-states)
|
||||
- [Base PG states](#base-pg-states)
|
||||
- [Additional PG states](#additional-pg-states)
|
||||
- [Removing a healthy disk](#removing-a-healthy-disk)
|
||||
- [Removing a failed disk](#removing-a-failed-disk)
|
||||
- [Adding a disk](#adding-a-disk)
|
||||
- [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration)
|
||||
- [Upgrading Vitastor](#upgrading-vitastor)
|
||||
- [OSD memory usage](#osd-memory-usage)
|
||||
|
||||
## Pool states
|
||||
|
||||
Pool is active — that is, fully available for client input/output — when all its PGs are
|
||||
'active' (maybe with some additional state flags).
|
||||
|
||||
If at least 1 PG is inactive, pool is also inactive and all clients suspend their I/O and
|
||||
wait until you fix the cluster. :-)
|
||||
|
||||
## PG states
|
||||
|
||||
PG states may be seen in [vitastor-cli status](cli.en.md#status) output.
|
||||
|
||||
PG state consists of exactly 1 base state and an arbitrary number of additional states.
|
||||
|
||||
### Base PG states
|
||||
|
||||
PG state always includes exactly 1 of the following base states:
|
||||
- **active** — PG is active and handles user I/O.
|
||||
- **incomplete** — Not enough OSDs are available to activate this PG. That is, more disks
|
||||
are lost than it's allowed by the pool's redundancy scheme. For example, if the pool has
|
||||
pg_size=3 and pg_minsize=1, part of the data may be written only to 1 OSD. If that exact
|
||||
OSD is lost, PG will become **incomplete**.
|
||||
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
|
||||
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
|
||||
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
|
||||
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
|
||||
- **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
|
||||
- **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
|
||||
the PG state.
|
||||
- **repeering** — PG is waiting for current I/O operations to complete and will
|
||||
then transition to **peering**.
|
||||
- **stopping** — PG is waiting for current I/O operations to complete and will
|
||||
then transition to **offline** or be activated by another OSD.
|
||||
|
||||
All states except **active** mean that PG is inactive and client I/O is suspended.
|
||||
|
||||
**peering** state is normally visible only for a short period of time during OSD restarts
|
||||
and during switching primary OSD of PGs.
|
||||
|
||||
**starting**, **repeering**, **stopping** states normally almost aren't visible at all.
|
||||
If you notice them for any noticeable time — chances are some operations on some OSDs hung.
|
||||
Search for "slow op" in OSD logs to find them — operations hung for more than
|
||||
[slow_log_interval](../config/osd.en.md#slow_log_interval) are logged as "slow ops".
|
||||
|
||||
State transition diagram:
|
||||
|
||||
![PG state transitions](pg_states.svg "PG state transitions")
|
||||
|
||||
### Additional PG states
|
||||
|
||||
If a PG is active it can also have any number of the following additional states:
|
||||
|
||||
- **degraded** — PG is running on reduced number of drives (OSDs), redundancy of all
|
||||
objects in this PG is reduced.
|
||||
- **has_incomplete** — some objects in this PG are incomplete (unrecoverable), that is,
|
||||
they have too many lost EC parts (more than pool's [parity_chunks](../config/pool.en.md#parity_chunks)).
|
||||
- **has_degraded** — some objects in this PG have reduced redundancy
|
||||
compared to the rest of the PG (so PG can be degraded+has_degraded at the same time).
|
||||
These objects should be healed automatically by recovery process, unless
|
||||
it's disabled by [no_recovery](../config/osd.en.md#no_recovery).
|
||||
- **has_misplaced** — some objects in this PG are stored on an OSD set different from
|
||||
the target set of the PG. These objects should be moved automatically, unless
|
||||
rebalance is disabled by [no_rebalance](../config/osd.en.md#no_rebalance). Objects
|
||||
that are degraded and misplaced at the same time are treated as just degraded.
|
||||
- **has_unclean** — one more state normally noticeable only for very short time during
|
||||
PG activation. It's used only with EC pools and means that some objects of this PG
|
||||
have started but not finished modifications. All such objects are either quickly
|
||||
committed or rolled back by the primary OSD when starting the PG, that is why the
|
||||
state shouldn't be noticeable. If you notice it, it probably means that commit or
|
||||
rollback operations are hung.
|
||||
- **has_invalid** — PG contains objects with incorrect part ID. Never occurs normally.
|
||||
It can only occur if you delete a non-empty EC pool and then recreate it as a replica
|
||||
pool or with smaller data part count.
|
||||
- **has_corrupted** — PG has corrupted objects, discovered by checking checksums during
|
||||
read or during scrub. When possible, such objects should be recovered automatically.
|
||||
If objects remain corrupted, use [vitastor-cli describe](cli.en.md#describe) to find
|
||||
out details and/or look into the log of the primary OSD of the PG.
|
||||
- **has_inconsistent** — PG has objects with non-matching parts or copies on different OSDs,
|
||||
and it's impossible to determine which copy is correct automatically. It may happen
|
||||
if you use a pool with 2 replica and you don't enable checksums, and if data on one
|
||||
of replicas becomes corrupted. You should also use vitastor-cli [describe](cli.en.md#describe)
|
||||
and [fix](cli.en.md#fix) commands to remove the incorrect version in this case.
|
||||
- **left_on_dead** — part of the data of this PG is left on unavailable OSD that isn't
|
||||
fully removed from the cluster. You should either start the corresponding OSD back and
|
||||
let it remove the unneeded data or remove it from cluster using vitastor-cli
|
||||
[rm-osd](cli.en.md#rm-osd) if you know that it's gone forever (for example, if the disk died).
|
||||
- **scrubbing** — data [scrub](../config/osd.en.md#auto_scrub) is running for this PG.
|
||||
|
||||
## Removing a healthy disk
|
||||
|
||||
Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
|
||||
move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
|
||||
For example:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
|
||||
```
|
||||
|
||||
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
## Removing a failed disk
|
||||
|
||||
If a disk is already dead, its OSD(s) are likely already stopped.
|
||||
|
||||
In this case just remove OSD(s) from the cluster by running `vitastor-cli rm-osd OSD_NUMBER`.
|
||||
|
||||
## Adding a disk
|
||||
|
||||
If you're adding a server, first install Vitastor packages and copy the
|
||||
`/etc/vitastor/vitastor.conf` configuration file to it.
|
||||
|
||||
After that you can just run `vitastor-disk prepare /dev/nvmeXXX`, of course with
|
||||
the same parameters which you used for other OSDs in your cluster before.
|
||||
|
||||
## Restoring from lost pool configuration
|
||||
|
||||
If you remove or corrupt `/vitastor/config/pools` key in etcd all pools will
|
||||
be deleted. Don't worry, the data won't be lost, but you'll need to perform
|
||||
a specific recovery procedure.
|
||||
|
||||
First you need to restore previous configuration of the pool with the same ID
|
||||
and EC/replica parameters and wait until pool PGs appear in `vitastor-cli status`.
|
||||
|
||||
Then add all OSDs into the history records of all PGs. You can do it by running
|
||||
the following script (just don't forget to use your own PG_COUNT and POOL_ID):
|
||||
|
||||
```
|
||||
PG_COUNT=32
|
||||
POOL_ID=1
|
||||
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
|
||||
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
|
||||
for i in $(seq 1 $PG_COUNT); do
|
||||
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
done
|
||||
```
|
||||
|
||||
After that all PGs should peer and find all previous data.
|
||||
|
||||
## Upgrading Vitastor
|
||||
|
||||
Every upcoming Vitastor version is usually compatible with previous both forward
|
||||
and backward regarding the network protocol and etcd data structures.
|
||||
|
||||
So, by default, if this page doesn't contain explicit different instructions, you
|
||||
can upgrade your Vitastor cluster by simply upgrading packages and restarting all
|
||||
OSDs and monitors in any order.
|
||||
|
||||
Upgrading is performed without stopping clients (VMs/containers), you just need to
|
||||
upgrade and restart servers one by one. However, ideally you should restart VMs too
|
||||
to make them use the new version of the client library.
|
||||
|
||||
Exceptions (specific upgrade instructions):
|
||||
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
|
||||
to be performed with full downtime: first you should stop all clients, then all OSDs,
|
||||
then upgrade and start everything back — because versions before 1.2.0 have several
|
||||
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
|
||||
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
|
||||
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
|
||||
without this intermediate step, client I/O will hang until the end of upgrade process.
|
||||
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
|
||||
|
||||
Rollback:
|
||||
- Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 can't be rolled
|
||||
back to 0.9.x or previous versions.
|
||||
- Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
|
||||
start with 0.7.x or 0.6.x. :-)
|
||||
|
||||
## OSD memory usage
|
||||
|
||||
OSD uses RAM mainly for:
|
||||
|
||||
- Metadata index: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `approximately 1.1` * `32` bytes.
|
||||
Consumed always.
|
||||
- Copy of the on-disk metadata area: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `28` bytes.
|
||||
Consumed if [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
|
||||
- Bitmaps: `data_size`/[`bitmap_granularity`](../config/layout-cluster.en.md#bitmap_granularity)/`8` * `2` bytes.
|
||||
Consumed always.
|
||||
- Journal index: between 0 and, approximately, journal size. Consumed always.
|
||||
- Copy of the on-disk journal area: exactly journal size. Consumed if
|
||||
[inmemory_journal](../config/osd.en.md#inmemory_journal) isn't disabled.
|
||||
- Checksums: `data_size`/[`csum_block_size`](../config/osd.en.md#csum_block_size) * 4 bytes.
|
||||
Consumed if checksums are enabled and [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
|
||||
|
||||
bitmap_granularity is almost always 4 KB.
|
||||
|
||||
So with default SSD settings (block_size=128k, journal_size=32M, csum_block_size=4k) memory usage is:
|
||||
|
||||
- Metadata and bitmaps: ~600 MB per 1 TB of data.
|
||||
- Journal: up to 64 MB per 1 OSD.
|
||||
- Checksums: 1 GB per 1 TB of data.
|
||||
|
||||
With default HDD settings (block_size=1M, journal_size=128M, csum_block_size=32k):
|
||||
|
||||
- Metadata and bitmaps: ~128 MB per 1 TB of data.
|
||||
- Journal: up to 256 MB per 1 OSD.
|
||||
- Checksums: 128 MB per 1 TB of data.
|
|
@ -0,0 +1,211 @@
|
|||
[Документация](../../README-ru.md#документация) → Использование → Администрирование
|
||||
|
||||
-----
|
||||
|
||||
[Read in English](admin.en.md)
|
||||
|
||||
# Администрирование
|
||||
|
||||
- [Состояния пулов](#состояния-пулов)
|
||||
- [Состояния PG](#состояния-pg)
|
||||
- [Базовые состояния PG](#базовые-состояния-pg)
|
||||
- [Дополнительные состояния PG](#дополнительные-состояния-pg)
|
||||
- [Удаление исправного диска](#удаление-исправного-диска)
|
||||
- [Удаление неисправного диска](#удаление-неисправного-диска)
|
||||
- [Добавление диска](#добавление-диска)
|
||||
- [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов)
|
||||
- [Обновление Vitastor](#обновление-vitastor)
|
||||
- [Потребление памяти OSD](#потребление-памяти-osd)
|
||||
|
||||
## Состояния пулов
|
||||
|
||||
Пул активен — то есть, полностью доступен для клиентского ввода-вывода — когда все его PG
|
||||
активны, то есть, имеют статус active, возможно, с любым набором дополнительных флагов.
|
||||
|
||||
Если хотя бы 1 PG неактивна, пул неактивен и все клиенты зависают и ждут, пока вы почините
|
||||
кластер. :-)
|
||||
|
||||
## Состояния PG
|
||||
|
||||
Вы можете видеть состояния PG в выводе команды [vitastor-cli status](cli.ru.md#status).
|
||||
|
||||
Состояние PG состоит из ровно 1 базового флага состояния, плюс любого числа дополнительных.
|
||||
|
||||
### Базовые состояния PG
|
||||
|
||||
Состояние PG включает в себя ровно 1 флаг из следующих:
|
||||
- **active** — PG активна и обрабатывает запросы ввода-вывода от пользователей.
|
||||
- **incomplete** — Недостаточно живых OSD, чтобы включить эту PG.
|
||||
То есть, дисков потеряно больше, чем разрешено схемой отказоустойчивости пула и pg_minsize.
|
||||
Например, если у пула pg_size=3 и pg_minsize=1, то часть данных может записаться всего на 1 OSD.
|
||||
Если потом конкретно этот OSD упадёт, PG окажется **incomplete**.
|
||||
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
|
||||
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
|
||||
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
|
||||
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
|
||||
- **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
|
||||
- **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
|
||||
- **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
|
||||
- **stopping** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **offline** или поднимется на другом OSD.
|
||||
|
||||
Все состояния, кроме **active**, означают, что PG неактивна и ввод-вывод приостановлен.
|
||||
|
||||
Состояние **peering** в норме заметно только при перезапуске OSD или переключении первичных
|
||||
OSD, на протяжении небольшого периода времени.
|
||||
|
||||
Состояния **starting**, **repeering**, **stopping** в норме практически не заметны вообще,
|
||||
PG должны очень быстро переходить из них в другие. Если эти состояния заметны
|
||||
хоть сколько-то значительное время — вероятно, какие-то операции на каких-то OSD зависли.
|
||||
Чтобы найти их, ищите "slow op" в журналах OSD — операции, зависшие дольше,
|
||||
чем на [slow_log_interval](../config/osd.ru.md#slow_log_interval), записываются в
|
||||
журналы OSD как "slow op".
|
||||
|
||||
Диаграмма переходов:
|
||||
|
||||
![Диаграмма переходов](pg_states.svg "Диаграмма переходов")
|
||||
|
||||
### Дополнительные состояния PG
|
||||
|
||||
Если PG активна, она также может иметь любое число дополнительных флагов состояний:
|
||||
|
||||
- **degraded** — PG поднята на неполном числе дисков (OSD), избыточность хранения всех объектов снижена.
|
||||
- **has_incomplete** — часть объектов в PG неполные (невосстановимые), то есть, у них потеряно
|
||||
слишком много EC-частей (больше, чем [parity_chunks](../config/pool.ru.md#parity_chunks) пула).
|
||||
- **has_degraded** — часть объектов в PG деградированы, избыточность их хранения снижена по сравнению
|
||||
с остальным содержимым данной PG (то есть, PG может одновременно быть degraded+has_degraded).
|
||||
Данные объекты должны восстановиться автоматически, если только восстановление не отключено
|
||||
через [no_recovery](../config/osd.ru.md#no_recovery).
|
||||
- **has_misplaced** — часть объектов в PG сейчас расположена не на целевом наборе OSD этой PG.
|
||||
Данные объекты должны переместиться автоматически, если только перебалансировка не отключена
|
||||
через [no_rebalance](../config/osd.ru.md#no_rebalance). Объекты, являющиеся одновременно
|
||||
degraded и misplaced, считаются просто degraded.
|
||||
- **has_unclean** — ещё одно состояние, в норме заметное только очень короткое время при поднятии PG.
|
||||
Применяется только к EC и означает, что на каких-то OSD этой PG есть EC-части объектов, для которых
|
||||
был начат, но не завершён процесс записи. Все такие объекты первичный OSD либо завершает, либо
|
||||
откатывает при поднятии PG первым делом, поэтому состояние и не должно быть заметно. Опять-таки,
|
||||
если оно заметно — значит, скорее всего, операции отката или завершения записи на каких-то OSD зависли.
|
||||
- **has_invalid** — в PG найдены объекты с некорректными ID части. В норме не проявляется вообще
|
||||
никогда, проявляется только если, не удалив данные, создать на месте EC-пула либо реплика-пул,
|
||||
либо EC-пул с меньшим числом частей данных.
|
||||
- **has_corrupted** — в PG есть повреждённые объекты, обнаруженные с помощью контрольных сумм или
|
||||
скраба (сверки копий). Если объекты можно восстановить, они восстановятся автоматически. Если
|
||||
не восстанавливаются, используйте команду [vitastor-cli describe](cli.ru.md#describe) для
|
||||
выяснения деталей и/или смотрите в журнал первичного OSD данной PG.
|
||||
- **has_inconsistent** — в PG есть объекты, у которых не совпадают копии/части данных на разных OSD,
|
||||
и при этом автоматически определить, какая копия верная, а какая нет, невозможно. Такое может
|
||||
произойти, если вы используете 2 реплики, не включали контрольные суммы, и на одной из реплик
|
||||
данные повредились. В этом случае тоже надо использовать команды vitastor-cli [describe](cli.ru.md#describe)
|
||||
и [fix](cli.ru.md#fix) для удаления некорректной версии.
|
||||
- **left_on_dead** — часть данных PG осталась на отключённом, но не удалённом из кластера окончательно,
|
||||
OSD. Вам нужно либо вернуть соответствующий OSD в строй и дать ему очистить лишние данные, либо
|
||||
удалить его из кластера окончательно с помощью vitastor-cli [rm-osd](cli.ru.md#rm-osd), если
|
||||
известно, что он уже не вернётся (например, если умер диск).
|
||||
- **scrubbing** — идёт фоновая проверка данных PG ([скраб](../config/osd.ru.md#auto_scrub)).
|
||||
|
||||
## Удаление исправного диска
|
||||
|
||||
Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
|
||||
Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
|
||||
```
|
||||
|
||||
Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
|
||||
|
||||
## Удаление неисправного диска
|
||||
|
||||
Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
|
||||
|
||||
В этом случае просто удалите OSD из etcd командой `vitastor-cli rm-osd НОМЕР_OSD`.
|
||||
|
||||
## Добавление диска
|
||||
|
||||
Если сервер новый, установите на него пакеты Vitastor и скопируйте файл конфигурации
|
||||
`/etc/vitastor/vitastor.conf`.
|
||||
|
||||
После этого достаточно выполнить команду `vitastor-disk prepare /dev/nvmeXXX`, разумеется,
|
||||
с параметрами, аналогичными другим OSD в вашем кластере.
|
||||
|
||||
## Восстановление потерянной конфигурации пулов
|
||||
|
||||
Если удалить или повредить ключ `/vitastor/config/pools` в etcd, все пулы будут удалены.
|
||||
Не волнуйтесь, данные потеряны не будут, но вам нужно будет провести специальную
|
||||
процедуру восстановления.
|
||||
|
||||
Сначала нужно будет восстановить конфигурацию пулов, создав пул с таким же ID и
|
||||
с такими же параметрами EC/реплик, и подождать, пока PG пула появятся в `vitastor-cli status`.
|
||||
|
||||
Далее нужно будет добавить все OSD в исторические записи всех PG. Примерно так
|
||||
(только подставьте свои PG_COUNT и POOL_ID):
|
||||
|
||||
```
|
||||
PG_COUNT=32
|
||||
POOL_ID=1
|
||||
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
|
||||
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
|
||||
for i in $(seq 1 $PG_COUNT); do
|
||||
    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
|
||||
done
|
||||
```
|
||||
|
||||
После этого все PG должны пройти peering и найти все предыдущие данные.
|
||||
|
||||
## Обновление Vitastor
|
||||
|
||||
Обычно каждая следующая версия Vitastor совместима с предыдущими и "вперёд", и "назад"
|
||||
с точки зрения сетевого протокола и структур данных в etcd.
|
||||
|
||||
Так что по умолчанию, если на данной странице не указано обратное, считается, что для
|
||||
обновления достаточно обновить пакеты и перезапустить все OSD и мониторы Vitastor в
|
||||
произвольном порядке.
|
||||
|
||||
Обновление производится без остановки клиентов (виртуальных машин/контейнеров), для этого
|
||||
достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
|
||||
начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.
|
||||
|
||||
Исключения (особые указания при обновлении):
|
||||
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
|
||||
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
|
||||
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
|
||||
могли приводить к некорректному чтению данных в деградированных EC-пулах.
|
||||
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
|
||||
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
|
||||
Иначе клиентский ввод-вывод зависнет до завершения обновления.
|
||||
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
|
||||
|
||||
Откат:
|
||||
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
|
||||
нельзя откатить до версии 0.9.x и более ранних.
|
||||
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
|
||||
до 0.7.x или 0.6.x. :-)
|
||||
|
||||
## Потребление памяти OSD
|
||||
|
||||
Основное потребление памяти складывается из:
|
||||
|
||||
- Индекс метаданных: `размер_данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `примерно 1.1` * `32` байт.
|
||||
Потребляется всегда.
|
||||
- Копия дисковой области метаданных: `размер_данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `28` байт.
|
||||
Потребляется, если не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
|
||||
- Битмапы: `размер_данных`/[`bitmap_granularity`](../config/layout-cluster.ru.md#bitmap_granularity)/`8` * `2` байт.
|
||||
Потребляется всегда.
|
||||
- Индекс журнала: от 0 до, приблизительно, размера журнала. Потребляется всегда.
|
||||
- Копия дисковой области журнала: в точности размер журнала. Потребляется,
|
||||
если не отключена настройка [inmemory_journal](../config/osd.ru.md#inmemory_journal).
|
||||
- Контрольные суммы: `размер_данных`/[`csum_block_size`](../config/osd.ru.md#csum_block_size) * `4` байт.
|
||||
Потребляется, если включены контрольные суммы и не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
|
||||
|
||||
bitmap_granularity, как правило, никогда не меняется и равен 4 килобайтам.
|
||||
|
||||
Таким образом, при SSD-настройках по умолчанию (block_size=128k, journal_size=32M, csum_block_size=4k) потребляется:
|
||||
|
||||
- Метаданные и битмапы: ~600 МБ на 1 ТБ данных
|
||||
- Журнал: до 64 МБ на 1 OSD
|
||||
- Контрольные суммы: 1 ГБ на 1 ТБ данных
|
||||
|
||||
При HDD-настройках по умолчанию (block_size=1M, journal_size=128M, csum_block_size=32k):
|
||||
|
||||
- Метаданные и битмапы: ~128 МБ на 1 ТБ данных
|
||||
- Журнал: до 256 МБ на 1 OSD
|
||||
- Контрольные суммы: 128 МБ на 1 ТБ данных
|
|
@ -186,11 +186,9 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
|
|||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
|
||||
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
|
||||
[--min-offset <offset>] [--max-offset <offset>]`
|
||||
`vitastor-cli describe [OPTIONS]`
|
||||
|
||||
Describe unclean object locations in the cluster.
|
||||
Describe unclean object locations in the cluster. Options:
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
|
@ -200,6 +198,8 @@ Describe unclean object locations in the cluster.
|
|||
degraded, misplaced, incomplete, corrupted, inconsistent.
|
||||
--pool <pool name or number>
|
||||
Only list objects in the given pool.
|
||||
--pg <pg number>
|
||||
Only list objects in the given PG of the pool.
|
||||
--inode, --min-inode, --max-inode
|
||||
Restrict listing to specific inode numbers.
|
||||
--min-offset, --max-offset
|
||||
|
@ -269,6 +269,8 @@ Optional parameters:
|
|||
| `--block_size 128k` | Put pool only on OSDs with this data block size |
|
||||
| `--bitmap_granularity 4k` | Put pool only on OSDs with this logical sector size |
|
||||
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
|
||||
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
|
||||
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
||||
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
||||
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
||||
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
|
||||
|
|
|
@ -194,12 +194,10 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||
|
||||
## describe
|
||||
|
||||
`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
|
||||
[--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
|
||||
[--min-offset <смещение>] [--max-offset <смещение>]`
|
||||
`vitastor-cli describe [ОПЦИИ]`
|
||||
|
||||
Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
|
||||
или части которых хранятся на наборе OSD, не равном целевому.
|
||||
или части которых хранятся на наборе OSD, не равном целевому. Опции:
|
||||
|
||||
```
|
||||
--osds <osds>
|
||||
|
@ -214,6 +212,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||
- inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
|
||||
--pool <имя или ID пула>
|
||||
Перечислять только объекты из заданного пула.
|
||||
--pg <номер PG>
|
||||
Перечислять только объекты из заданной PG пула.
|
||||
--inode, --min-inode, --max-inode
|
||||
Перечислять только объекты из указанных номеров инодов (образов).
|
||||
--min-offset, --max-offset
|
||||
|
@ -286,6 +286,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||
| `--block_size 128k` | ...только OSD с данным размером блока |
|
||||
| `--bitmap_granularity 4k` | ...только OSD с данным размером логического сектора |
|
||||
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
|
||||
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
|
||||
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
|
||||
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
|
||||
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
|
||||
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |
|
||||
|
|
|
@ -88,7 +88,7 @@ Options (both modes):
|
|||
--block_size 1M/128k Set blockstore object size
|
||||
--bitmap_granularity 4k Set bitmap granularity
|
||||
--data_csum_type none Set data checksum type (crc32c or none)
|
||||
--csum_block_size 4k Set data checksum block size
|
||||
--csum_block_size 4k/32k Set data checksum block size (SSD/HDD default)
|
||||
--data_device_block 4k Override data device block size
|
||||
--meta_device_block 4k Override metadata device block size
|
||||
--journal_device_block 4k Override journal device block size
|
||||
|
|
|
@ -89,7 +89,7 @@ vitastor-disk - инструмент командной строки для уп
|
|||
--block_size 1M/128k Задать размер объекта хранилища
|
||||
--bitmap_granularity 4k Задать гранулярность битовых карт
|
||||
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
|
||||
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
|
||||
--csum_block_size 4k/32k Задать размер блока расчёта контрольных сумм (дефолт SSD/HDD)
|
||||
--data_device_block 4k Задать размер блока устройства данных
|
||||
--meta_device_block 4k Задать размер блока метаданных
|
||||
--journal_device_block 4k Задать размер блока журнала
|
||||
|
|
|
@ -15,12 +15,21 @@ See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
|
|||
|
||||
Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
|
||||
|
||||
## Map image
|
||||
Supports the following commands:
|
||||
|
||||
- [map](#map)
|
||||
- [unmap](#unmap)
|
||||
- [ls](#ls)
|
||||
- [netlink-map](#netlink-map)
|
||||
- [netlink-unmap](#netlink-unmap)
|
||||
- [netlink-revive](#netlink-revive)
|
||||
|
||||
## map
|
||||
|
||||
To create a local block device for a Vitastor image run:
|
||||
|
||||
```
|
||||
vitastor-nbd map --image testimg
|
||||
vitastor-nbd map [/dev/nbdN] --image testimg
|
||||
```
|
||||
|
||||
It will output a block device name like /dev/nbd0 which you can then use as a normal disk.
|
||||
|
@ -29,25 +38,25 @@ You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--ima
|
|||
|
||||
vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
|
||||
|
||||
* `--nbd_timeout 300` \
|
||||
Timeout for I/O operations in seconds after exceeding which the kernel stops
|
||||
the device. You can set it to 0 to disable the timeout, but beware that you
|
||||
won't be able to stop the device at all if vitastor-nbd process dies.
|
||||
* `--nbd_timeout 0` \
|
||||
Timeout for I/O operations in seconds after exceeding which the kernel stops the device.
|
||||
Before Linux 5.19, if nbd_timeout is 0, a dead NBD device can't be removed from
|
||||
the system at all without rebooting.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
|
||||
  Note that the maximum allowed value of (nbds_max)*(1+max_part) is 256.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Write log messages to the specified file instead of dropping them (in background mode)
|
||||
or printing them to the standard output (in foreground mode).
|
||||
* `--dev_num N` \
|
||||
Use the specified device /dev/nbdN instead of automatic selection.
|
||||
Use the specified device /dev/nbdN instead of automatic selection (alternative syntax
|
||||
to /dev/nbdN positional parameter).
|
||||
* `--foreground 1` \
|
||||
Stay in foreground, do not daemonize.
|
||||
|
||||
Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
|
||||
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
|
||||
|
||||
## Unmap image
|
||||
## unmap
|
||||
|
||||
To unmap the device run:
|
||||
|
||||
|
@ -55,12 +64,14 @@ To unmap the device run:
|
|||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## List mapped images
|
||||
## ls
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
List mapped images.
|
||||
|
||||
Example output (normal format):
|
||||
|
||||
```
|
||||
|
@ -78,3 +89,45 @@ Example output (JSON format):
|
|||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
||||
## netlink-map
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
|
||||
```
|
||||
|
||||
On recent kernel versions it's also possible to map NBD devices using the netlink interface.
|
||||
|
||||
This is an experimental feature because it doesn't solve all issues of NBD. Differences from regular ioctl-based 'map':
|
||||
|
||||
1. netlink-map can create new `/dev/nbdN` devices (those not present in /dev/).
|
||||
2. netlink-mapped devices can be unmapped only using `netlink-unmap` command.
|
||||
3. netlink-mapped devices don't show up in `ls` output (yet).
|
||||
4. Dead netlink-mapped devices can be 'revived' using `netlink-revive`.
|
||||
However, old I/O requests will hang forever if `nbd_timeout` is not specified.
|
||||
5. netlink-map supports additional options:
|
||||
|
||||
* `--nbd_conn_timeout 0` \
|
||||
Disconnect a dead device automatically after this number of seconds.
|
||||
* `--nbd_destroy_on_disconnect 1` \
|
||||
Delete the nbd device on disconnect.
|
||||
* `--nbd_disconnect_on_close 1` \
|
||||
Disconnect the nbd device on close by last opener.
|
||||
* `--nbd_ro 1` \
|
||||
Set device into read only mode.
|
||||
|
||||
## netlink-unmap
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-unmap /dev/nbdN
|
||||
```
|
||||
|
||||
Unmap a device using netlink interface. Works with both netlink and ioctl mapped devices.
|
||||
|
||||
## netlink-revive
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
|
||||
```
|
||||
|
||||
Restart a dead NBD netlink-mapped device without removing it. Supports the same options as `netlink-map`.
|
||||
|
|
|
@ -18,12 +18,21 @@ NBD немного снижает производительность из-за
|
|||
|
||||
CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
|
||||
|
||||
## Подключить устройство
|
||||
Поддерживаются следующие команды:
|
||||
|
||||
- [map](#map)
|
||||
- [unmap](#unmap)
|
||||
- [ls](#ls)
|
||||
- [netlink-map](#netlink-map)
|
||||
- [netlink-unmap](#netlink-unmap)
|
||||
- [netlink-revive](#netlink-revive)
|
||||
|
||||
## map
|
||||
|
||||
Чтобы создать локальное блочное устройство для образа, выполните команду:
|
||||
|
||||
```
|
||||
vitastor-nbd map --image testimg
|
||||
vitastor-nbd map [/dev/nbdN] --image testimg
|
||||
```
|
||||
|
||||
Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
|
||||
|
@ -35,16 +44,13 @@ vitastor-nbd map --image testimg
|
|||
vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
|
||||
плюс специфичные для NBD:
|
||||
|
||||
* `--nbd_timeout 30` \
|
||||
* `--nbd_timeout 0` \
|
||||
Максимальное время выполнения любой операции чтения/записи в секундах, при
|
||||
превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
|
||||
в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
|
||||
вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
|
||||
vitastor-nbd.
|
||||
превышении которого ядро остановит NBD-устройство. На ядрах Linux старее 5.19,
|
||||
если таймаут установлен в 0, NBD-устройство вообще невозможно отключить из системы
|
||||
при нештатном завершении процесса.
|
||||
* `--nbd_max_devices 64 --nbd_max_part 3` \
|
||||
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
|
||||
(`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
|
||||
обычно не должно превышать 256.
|
||||
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd (`nbds_max` и `max_part`).
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
|
||||
при фоновом режиме запуска или печати на стандартный вывод при запуске
|
||||
|
@ -58,7 +64,7 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
|
|||
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
|
||||
заданном опцией `--config_file`.
|
||||
|
||||
## Отключить устройство
|
||||
## unmap
|
||||
|
||||
Для отключения устройства выполните:
|
||||
|
||||
|
@ -66,12 +72,14 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
|
|||
vitastor-nbd unmap /dev/nbd0
|
||||
```
|
||||
|
||||
## Вывести подключённые устройства
|
||||
## ls
|
||||
|
||||
```
|
||||
vitastor-nbd ls [--json]
|
||||
```
|
||||
|
||||
Вывести подключённые устройства.
|
||||
|
||||
Пример вывода в обычном формате:
|
||||
|
||||
```
|
||||
|
@ -89,3 +97,46 @@ pid: 584546
|
|||
```
|
||||
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
||||
|
||||
## netlink-map
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <POOL> --inode <INODE> --size <SIZE>)
|
||||
```
|
||||
|
||||
На свежих версиях ядра Linux также возможно подключать NBD-устройства через интерфейс netlink.
|
||||
|
||||
Это экспериментальная функция, так как она не решает всех проблем NBD. Отличия от обычного 'map':
|
||||
|
||||
1. Можно создавать новые `/dev/nbdN` устройства (отсутствующие в /dev/).
|
||||
2. Отключать netlink-устройства можно только командой `netlink-unmap`.
|
||||
3. netlink-устройства не видно в выводе `ls` (пока что).
|
||||
4. Мёртвые netlink-устройства можно "оживить" командой `netlink-revive`. Правда, предыдущие
|
||||
запросы ввода-вывода всё равно зависнут навсегда, если `nbd_timeout` не задан.
|
||||
5. Поддерживаются дополнительные опции:
|
||||
|
||||
* `--nbd_conn_timeout 0` \
|
||||
Отключать мёртвое устройство автоматически через данное число секунд.
|
||||
* `--nbd_destroy_on_disconnect 1` \
|
||||
Удалять NBD-устройство при отключении.
|
||||
* `--nbd_disconnect_on_close 1` \
|
||||
Отключать NBD-устройство автоматически, когда его все закроют.
|
||||
* `--nbd_ro 1` \
|
||||
Установить для NBD-устройства режим "только для чтения".
|
||||
|
||||
## netlink-unmap
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-unmap /dev/nbdN
|
||||
```
|
||||
|
||||
Отключить устройство через интерфейс netlink. Работает и с обычными, и с netlink-устройствами.
|
||||
|
||||
## netlink-revive
|
||||
|
||||
```
|
||||
vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
|
||||
```
|
||||
|
||||
Оживить мёртвое NBD-устройство, ранее подключённое через netlink, без удаления. Поддерживает
|
||||
те же опции, что и `netlink-map`.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
[Documentation](../../README.md#documentation) → Usage → NFS
|
||||
[Documentation](../../README.md#documentation) → Usage → VitastorFS and pseudo-FS
|
||||
|
||||
-----
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
[Документация](../../README-ru.md#документация) → Использование → NFS
|
||||
[Документация](../../README-ru.md#документация) → Использование → VitastorFS и псевдо-ФС
|
||||
|
||||
-----
|
||||
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
digraph G {
|
||||
rankdir=LR;
|
||||
bgcolor=transparent;
|
||||
edge [color="#00A000"];
|
||||
node [shape=hexagon, fillcolor="#A0A000", fontcolor=white, fontname="sans-serif", fontsize=12, style=filled, penwidth=0];
|
||||
offline -> starting -> peering -> offline;
|
||||
stopping -> offline;
|
||||
starting -> incomplete -> offline;
|
||||
active -> repeering -> peering -> active -> stopping;
|
||||
offline [fillcolor="#A00000"];
|
||||
incomplete [fillcolor="#A00000"];
|
||||
active [fillcolor="#00A000"];
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<!-- Generated by graphviz version 2.43.0 (0)
|
||||
-->
|
||||
<!-- Title: G Pages: 1 -->
|
||||
<svg width="603pt" height="123pt"
|
||||
viewBox="0.00 0.00 602.66 122.55" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 118.55)">
|
||||
<title>G</title>
|
||||
<!-- offline -->
|
||||
<g id="node1" class="node">
|
||||
<title>offline</title>
|
||||
<polygon fill="#a00000" stroke="black" stroke-width="0" points="75.52,-56 56.6,-74 18.75,-74 -0.17,-56 18.75,-38 56.6,-38 75.52,-56"/>
|
||||
<text text-anchor="middle" x="37.67" y="-52.9" font-family="sans-serif" font-size="12.00" fill="white">offline</text>
|
||||
</g>
|
||||
<!-- starting -->
|
||||
<g id="node2" class="node">
|
||||
<title>starting</title>
|
||||
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="199.56,-79 177.49,-97 133.35,-97 111.28,-79 133.35,-61 177.49,-61 199.56,-79"/>
|
||||
<text text-anchor="middle" x="155.42" y="-75.9" font-family="sans-serif" font-size="12.00" fill="white">starting</text>
|
||||
</g>
|
||||
<!-- offline->starting -->
|
||||
<g id="edge1" class="edge">
|
||||
<title>offline->starting</title>
|
||||
<path fill="none" stroke="#00a000" d="M69.39,-62.1C81.66,-64.54 96.04,-67.4 109.45,-70.06"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="108.98,-73.54 119.47,-72.05 110.34,-66.67 108.98,-73.54"/>
|
||||
</g>
|
||||
<!-- peering -->
|
||||
<g id="node3" class="node">
|
||||
<title>peering</title>
|
||||
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="335.57,-95 313.96,-113 270.74,-113 249.13,-95 270.74,-77 313.96,-77 335.57,-95"/>
|
||||
<text text-anchor="middle" x="292.35" y="-91.9" font-family="sans-serif" font-size="12.00" fill="white">peering</text>
|
||||
</g>
|
||||
<!-- starting->peering -->
|
||||
<g id="edge2" class="edge">
|
||||
<title>starting->peering</title>
|
||||
<path fill="none" stroke="#00a000" d="M194.36,-83.5C209.71,-85.32 227.6,-87.44 243.8,-89.36"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="243.82,-92.89 254.16,-90.59 244.64,-85.94 243.82,-92.89"/>
|
||||
</g>
|
||||
<!-- incomplete -->
|
||||
<g id="node5" class="node">
|
||||
<title>incomplete</title>
|
||||
<polygon fill="#a00000" stroke="black" stroke-width="0" points="349.09,-41 320.72,-59 263.99,-59 235.62,-41 263.99,-23 320.72,-23 349.09,-41"/>
|
||||
<text text-anchor="middle" x="292.35" y="-37.9" font-family="sans-serif" font-size="12.00" fill="white">incomplete</text>
|
||||
</g>
|
||||
<!-- starting->incomplete -->
|
||||
<g id="edge5" class="edge">
|
||||
<title>starting->incomplete</title>
|
||||
<path fill="none" stroke="#00a000" d="M188.74,-69.9C204.92,-65.34 224.85,-59.73 242.82,-54.67"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="243.9,-58 252.57,-51.92 242,-51.26 243.9,-58"/>
|
||||
</g>
|
||||
<!-- peering->offline -->
|
||||
<g id="edge3" class="edge">
|
||||
<title>peering->offline</title>
|
||||
<path fill="none" stroke="#00a000" d="M259.32,-103.69C222.67,-112.11 161.28,-121.52 111.35,-106 94.55,-100.78 78.2,-90.18 65.27,-80.08"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="67.26,-77.19 57.3,-73.58 62.84,-82.61 67.26,-77.19"/>
|
||||
</g>
|
||||
<!-- active -->
|
||||
<g id="node6" class="node">
|
||||
<title>active</title>
|
||||
<polygon fill="#00a000" stroke="black" stroke-width="0" points="456.34,-49 438.55,-67 402.97,-67 385.18,-49 402.97,-31 438.55,-31 456.34,-49"/>
|
||||
<text text-anchor="middle" x="420.76" y="-45.9" font-family="sans-serif" font-size="12.00" fill="white">active</text>
|
||||
</g>
|
||||
<!-- peering->active -->
|
||||
<g id="edge9" class="edge">
|
||||
<title>peering->active</title>
|
||||
<path fill="none" stroke="#00a000" d="M322.99,-84.22C341.47,-77.49 365.34,-68.8 384.75,-61.74"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="385.96,-65.03 394.16,-58.32 383.56,-58.45 385.96,-65.03"/>
|
||||
</g>
|
||||
<!-- stopping -->
|
||||
<g id="node4" class="node">
|
||||
<title>stopping</title>
|
||||
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="591.65,-18 567.57,-36 519.39,-36 495.31,-18 519.39,0 567.57,0 591.65,-18"/>
|
||||
<text text-anchor="middle" x="543.48" y="-14.9" font-family="sans-serif" font-size="12.00" fill="white">stopping</text>
|
||||
</g>
|
||||
<!-- stopping->offline -->
|
||||
<g id="edge4" class="edge">
|
||||
<title>stopping->offline</title>
|
||||
<path fill="none" stroke="#00a000" d="M500.13,-14.3C440.78,-9.83 329.58,-4.07 235.49,-14 179.71,-19.89 116.5,-34.9 77.11,-45.29"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="76.14,-41.92 67.38,-47.89 77.94,-48.69 76.14,-41.92"/>
|
||||
</g>
|
||||
<!-- incomplete->offline -->
|
||||
<g id="edge6" class="edge">
|
||||
<title>incomplete->offline</title>
|
||||
<path fill="none" stroke="#00a000" d="M240.25,-44.03C194.33,-46.76 127.57,-50.72 83.64,-53.33"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="83.32,-49.84 73.54,-53.93 83.73,-56.83 83.32,-49.84"/>
|
||||
</g>
|
||||
<!-- active->stopping -->
|
||||
<g id="edge10" class="edge">
|
||||
<title>active->stopping</title>
|
||||
<path fill="none" stroke="#00a000" d="M449.46,-41.89C463.64,-38.25 481.26,-33.72 497.34,-29.59"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="498.29,-32.96 507.11,-27.08 496.55,-26.18 498.29,-32.96"/>
|
||||
</g>
|
||||
<!-- repeering -->
|
||||
<g id="node7" class="node">
|
||||
<title>repeering</title>
|
||||
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="594.84,-83 569.16,-101 517.8,-101 492.12,-83 517.8,-65 569.16,-65 594.84,-83"/>
|
||||
<text text-anchor="middle" x="543.48" y="-79.9" font-family="sans-serif" font-size="12.00" fill="white">repeering</text>
|
||||
</g>
|
||||
<!-- active->repeering -->
|
||||
<g id="edge7" class="edge">
|
||||
<title>active->repeering</title>
|
||||
<path fill="none" stroke="#00a000" d="M448.85,-56.63C462.9,-60.59 480.44,-65.53 496.53,-70.06"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="495.74,-73.47 506.32,-72.82 497.64,-66.74 495.74,-73.47"/>
|
||||
</g>
|
||||
<!-- repeering->peering -->
|
||||
<g id="edge8" class="edge">
|
||||
<title>repeering->peering</title>
|
||||
<path fill="none" stroke="#00a000" d="M495.33,-85.27C451.99,-87.36 387.93,-90.44 343.63,-92.58"/>
|
||||
<polygon fill="#00a000" stroke="#00a000" points="343.2,-89.09 333.38,-93.07 343.54,-96.09 343.2,-89.09"/>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 5.9 KiB |
|
@ -0,0 +1,49 @@
|
|||
// ESLint configuration for the monitor (mon/) JavaScript code.
// Enforces the project style: 4-space indent, Allman braces, unix line endings.
module.exports = {
    "env": {
        "es6": true,
        "node": true
    },
    "extends": [
        "eslint:recommended",
        "plugin:node/recommended"
    ],
    "parserOptions": {
        "ecmaVersion": 2020
    },
    "plugins": [
    ],
    "rules": {
        // 4-space indentation
        "indent": [
            "error",
            4
        ],
        // Allman brace style; single-line blocks are allowed
        "brace-style": [
            "error",
            "allman",
            { "allowSingleLine": true }
        ],
        "linebreak-style": [
            "error",
            "unix"
        ],
        "semi": [
            "error",
            "always"
        ],
        // The following recommended rules are intentionally relaxed for this codebase
        "no-useless-escape": [
            "off"
        ],
        "no-control-regex": [
            "off"
        ],
        "no-empty": [
            "off"
        ],
        "no-process-exit": [
            "off"
        ],
        "node/shebang": [
            "off"
        ]
    }
};
|
|
@ -97,7 +97,6 @@ function scale_pg_history(prev_pg_history, prev_pgs, new_pgs)
|
|||
|
||||
function scale_pg_count(prev_pgs, new_pg_count)
|
||||
{
|
||||
const old_pg_count = prev_pgs.length;
|
||||
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
|
||||
if (prev_pgs.length < new_pg_count)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,409 @@
|
|||
const { select_murmur3 } = require('./murmur3.js');
|
||||
|
||||
const NO_OSD = 'Z';
|
||||
|
||||
// Generates and validates PG (placement group) OSD combinations
// according to a set of placement rules (see parse_pg_dsl / parse_level_indexes).
class RuleCombinator
{
    // osd_tree: map of tree nodes (values are { id, level, parent?, size? });
    //   entries without an id are filtered out before indexing.
    // rules: [ level, operator, value ][][] — one rule list per PG position.
    // max_combinations: number of extra pseudo-random combinations to generate.
    // ordered: treat (x,y) and (y,x) as different PGs (needed when OSD order matters, e.g. EC).
    constructor(osd_tree, rules, max_combinations, ordered)
    {
        this.osd_tree = index_tree(Object.values(osd_tree).filter(o => o.id));
        this.rules = rules;
        this.max_combinations = max_combinations;
        this.ordered = ordered;
    }

    // Generate deterministic pseudo-random PG combinations matching the rules.
    random_combinations()
    {
        return random_custom_combinations(this.osd_tree, this.rules, this.max_combinations, this.ordered);
    }

    // Filter the given PGs, keeping only those that still satisfy the rules.
    check_combinations(pgs)
    {
        return check_custom_combinations(this.osd_tree, this.rules, pgs);
    }
}
|
||||
|
||||
// Convert alternative "level-index" format to rules
// level_index = { [level: string]: string | string[] }
// level_sequence = optional, levels from upper to lower, i.e. [ 'dc', 'host' ]
// Example: level_index = { dc: "112233", host: "ABCDEF" }
// Returns rules in [ level, operator, value ][][] form (one rule list per PG position).
// Fix vs previous version: removed the dead local `lvl_first` which was
// populated but never read.
function parse_level_indexes(level_index, level_sequence)
{
    const rules = [];
    for (const level in level_index)
    {
        const idx = level_index[level];
        // One rule list per PG position
        while (rules.length < idx.length)
        {
            rules.push([]);
        }
        // seen[c] = 1-based position of the first occurrence of group id c
        const seen = {};
        for (let i = 0; i < idx.length; i++)
        {
            if (!seen[idx[i]])
            {
                // First occurrence of this group: must differ from all groups seen so far
                const other = Object.values(seen);
                if (other.length)
                {
                    rules[i].push([ level, '!=', other ]);
                }
                seen[idx[i]] = i+1;
            }
            else
            {
                // Repeated occurrence: must be placed in the same group as the first one
                rules[i].push([ level, '=', seen[idx[i]] ]);
            }
        }
    }
    if (level_sequence)
    {
        // Prune useless rules for the sake of prettiness
        // For simplicity, call "upper" level DC and "lower" level host
        const level_prio = {};
        level_sequence.forEach((level, pos) => level_prio[level] = pos);
        for (let upper_i = 0; upper_i < level_sequence.length-1; upper_i++)
        {
            const upper_level = level_sequence[upper_i];
            for (let i = 0; i < rules.length; i++)
            {
                // Positions already known to be in a different upper-level group
                const noteq = {};
                for (let k = 0; k < level_index[upper_level].length; k++)
                {
                    // If upper_level[x] is different from upper_level[y]
                    // then lower_level[x] is also different from lower_level[y]
                    if (level_index[upper_level][k] != level_index[upper_level][i])
                    {
                        noteq[k+1] = true;
                    }
                }
                for (let j = 0; j < rules[i].length; j++)
                {
                    // Only lower-level '!=' rules can be implied by the upper level
                    if (level_prio[rules[i][j][0]] != null && level_prio[rules[i][j][0]] > upper_i && rules[i][j][1] == '!=')
                    {
                        rules[i][j][2] = rules[i][j][2].filter(other_host => !noteq[other_host]);
                        if (!rules[i][j][2].length)
                        {
                            rules[i].splice(j--, 1);
                        }
                    }
                }
            }
        }
    }
    return rules;
}
|
||||
|
||||
// Parse rules in DSL format
// dsl := item | item ("\n" | ",") items
// item := "any" | rules
// rules := rule | rule rules
// rule := level operator arg
// level := /\w+/
// operator := "!=" | "=" | ">" | "?="
// arg := value | "(" values ")"
// values := value | value "," values
// value := item_ref | constant_id
// item_ref := /\d+/
// constant_id := /"([^"]+)"/
//
// Output: [ level, operator, value ][][]
// (one rule list per PG position; numbers are 1-based back-references to
// earlier positions, { id } objects are constant node IDs)
function parse_pg_dsl(text)
{
    // Tokenize: words, operators, punctuation, newlines, quoted constants
    const tokens = [ ...text.matchAll(/\w+|!=|\?=|[>=\(\),\n]|"([^\"]+)"/g) ].map(t => [ t[0], t.index ]);
    let positions = [ [] ];
    // <rules> always points at the rule list of the current (last) position
    let rules = positions[0];
    for (let i = 0; i < tokens.length; )
    {
        if (tokens[i][0] === '\n' || tokens[i][0] === ',')
        {
            // Separator: start a new PG position
            rules = [];
            positions.push(rules);
            i++;
        }
        else if (!rules.length && tokens[i][0] === 'any' && (i == tokens.length-1 || tokens[i+1][0] === ',' || tokens[i+1][0] === '\n'))
        {
            // Bare "any" = a position without restrictions (empty rule list)
            i++;
        }
        else
        {
            // Expect a full "level operator value" triple
            if (!/^\w/.exec(tokens[i][0]))
            {
                throw new Error('Unexpected '+tokens[i][0]+' at '+tokens[i][1]+' (level name expected)');
            }
            if (i > tokens.length-3)
            {
                throw new Error('Unexpected EOF (operator and value expected)');
            }
            if (/^\w/.exec(tokens[i+1][0]) || tokens[i+1][0] === ',' || tokens[i+1][0] === '\n')
            {
                throw new Error('Unexpected '+tokens[i+1][0]+' at '+tokens[i+1][1]+' (operator expected)');
            }
            if (!/^[\w"(]/.exec(tokens[i+2][0])) // "
            {
                throw new Error('Unexpected '+tokens[i+2][0]+' at '+tokens[i+2][1]+' (id, round brace, number or node ID expected)');
            }
            let rule = [ tokens[i][0], tokens[i+1][0], tokens[i+2][0] ];
            i += 3;
            if (rule[2][0] == '"')
            {
                // Quoted constant -> { id } (strip the quotes)
                rule[2] = { id: rule[2].substr(1, rule[2].length-2) };
            }
            else if (rule[2] === '(')
            {
                // Parenthesized list of values
                rule[2] = [];
                // eslint-disable-next-line no-constant-condition
                while (true)
                {
                    if (i > tokens.length-1)
                    {
                        throw new Error('Unexpected EOF (expected list and a closing round brace)');
                    }
                    if (tokens[i][0] === ',')
                    {
                        // List separator
                        i++;
                    }
                    else if (tokens[i][0] === ')')
                    {
                        // End of list
                        i++;
                        break;
                    }
                    else if (tokens[i][0][0] === '"')
                    {
                        // Quoted constant node ID
                        rule[2].push({ id: tokens[i][0].substr(1, tokens[i][0].length-2) });
                        i++;
                    }
                    else if (/^\d+$/.exec(tokens[i][0]))
                    {
                        // Numeric back-reference to an earlier position (1-based)
                        const n = 0|tokens[i][0];
                        if (!n)
                        {
                            throw new Error('Level reference cannot be 0 (refs count from 1) at '+tokens[i][1]);
                        }
                        else if (n > positions.length)
                        {
                            throw new Error('Forward references are forbidden at '+tokens[i][1]);
                        }
                        rule[2].push(n);
                        i++;
                    }
                    else if (!/^\w/.exec(tokens[i][0]))
                    {
                        throw new Error('Unexpected '+tokens[i][0]+' at '+tokens[i][1]+' (number or node ID expected)');
                    }
                    else
                    {
                        // Bare word = constant node ID
                        rule[2].push({ id: tokens[i][0] });
                        i++;
                    }
                }
            }
            else if (!/^\d+$/.exec(rule[2]))
            {
                // Bare non-numeric word = constant node ID
                rule[2] = { id: rule[2] };
            }
            else
            {
                // Numeric back-reference; note tokens[i-1] is the value token here
                rule[2] = 0|rule[2];
                if (!rule[2])
                {
                    throw new Error('Level reference cannot be 0 (refs count from 1) at '+tokens[i-1][1]);
                }
                else if (rule[2] > positions.length)
                {
                    throw new Error('Forward references are forbidden at '+tokens[i-1][1]);
                }
            }
            rules.push(rule);
        }
    }
    return positions;
}
|
||||
|
||||
// osd_tree = index_tree() output
// levels = { string: number }
// rules = [ level, operator, value ][][]
// level = string
// operator = '=' | '!=' | '>' | '?='
// value = number|number[] | { id: string|string[] }
// examples:
// 1) simple 3 replicas with failure_domain=host:
//    [ [], [ [ 'host', '!=', 1 ] ], [ [ 'host', '!=', [ 1, 2 ] ] ] ]
//    in DSL form: any, host!=1, host!=(1,2)
// 2) EC 4+2 in 3 DC:
//    [ [], [ [ 'dc', '=', 1 ], [ 'host', '!=', 1 ] ],
//      [ 'dc', '!=', 1 ], [ [ 'dc', '=', 3 ], [ 'host', '!=', 3 ] ],
//      [ 'dc', '!=', [ 1, 3 ] ], [ [ 'dc', '=', 5 ], [ 'host', '!=', 5 ] ] ]
//    in DSL form: any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5
// 3) 1 replica in fixed DC + 2 in random DCs:
//    [ [ [ 'dc', '=', { id: 'meow' } ] ], [ [ 'dc', '!=', 1 ] ], [ [ 'dc', '!=', [ 1, 2 ] ] ] ]
//    in DSL form: dc=meow, dc!=1, dc!=(1,2)
// 4) 2 replicas in each DC (almost the same as (2)):
//    DSL: any, dc=1 host!=1, dc!=1, dc=3 host!=3
// Alternative simpler way to specify rules would be: [ DC: 112233 HOST: 123456 ]
//
// Returns { ['pg_'+osds.join('_')]: osd_id[] }; selection is deterministic
// (murmur3-keyed), so repeated calls with the same input produce the same PGs.
function random_custom_combinations(osd_tree, rules, count, ordered)
{
    const r = {};
    // Candidates for the first position (usually the rule list is empty = "any")
    const first = filter_tree_by_rules(osd_tree, rules[0], []);
    let max_size = 0;
    // All combinations for the first item (usually "any") to try to include each OSD at least once
    for (const f of first)
    {
        const selected = [ f ];
        for (let i = 1; i < rules.length; i++)
        {
            const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
            // Deterministic pick keyed by the first OSD's ID
            const idx = select_murmur3(filtered.length, i => 'p:'+f.id+':'+filtered[i].id);
            // id: null = placeholder when no candidate matches this position
            selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
        }
        const size = selected.filter(s => s.id !== null).length;
        max_size = max_size < size ? size : max_size;
        const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
        // For unordered pools (x,y) == (y,x), so normalize by sorting
        if (!ordered)
            pg.sort();
        r['pg_'+pg.join('_')] = pg;
    }
    // Pseudo-random selection
    for (let n = 0; n < count; n++)
    {
        const selected = [];
        for (const item_rules of rules)
        {
            const filtered = selected.length ? filter_tree_by_rules(osd_tree, item_rules, selected) : first;
            // Keyed by the iteration number so each n yields a different PG
            const idx = select_murmur3(filtered.length, i => n+':'+filtered[i].id);
            selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
        }
        const size = selected.filter(s => s.id !== null).length;
        max_size = max_size < size ? size : max_size;
        const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
        if (!ordered)
            pg.sort();
        r['pg_'+pg.join('_')] = pg;
    }
    // Exclude PGs with less successful selections than maximum
    for (const k in r)
    {
        if (r[k].filter(s => s !== NO_OSD).length < max_size)
        {
            delete r[k];
        }
    }
    return r;
}
|
||||
|
||||
// Return the leaf (OSD) nodes of <osd_tree> that satisfy all <rules> for one
// PG position, given the items already <selected> for earlier positions
// (numeric rule values are 1-based back-references into <selected>).
function filter_tree_by_rules(osd_tree, rules, selected)
{
    // NOTE(review): <cur> aliases the tree's children array and the "Descend"
    // branches splice into it; in practice index_tree() flattens the root's
    // children to leaves so those branches shouldn't fire — confirm before
    // reusing this with a non-flattened tree.
    let cur = osd_tree[''].children;
    for (const rule of rules)
    {
        // Resolve rule values: { id } -> constant node ID,
        // number N -> the N-th selected item's ID at this rule's level
        const val = (rule[2] instanceof Array ? rule[2] : [ rule[2] ])
            .map(v => v instanceof Object ? v.id : selected[v-1].levels[rule[0]]);
        let preferred = [], other = [];
        for (let i = 0; i < cur.length; i++)
        {
            const item = cur[i];
            const level_id = item.levels[rule[0]];
            if (level_id)
            {
                // '>' : greater than ALL values; '='/'?=' : equal to ALL values;
                // '!=' : different from ALL values
                if (rule[1] == '>' && val.filter(v => level_id <= v).length == 0 ||
                    (rule[1] == '=' || rule[1] == '?=') && val.filter(v => level_id != v).length == 0 ||
                    rule[1] == '!=' && val.filter(v => level_id == v).length == 0)
                {
                    // Include
                    preferred.push(item);
                }
                else if (rule[1] == '?=' && val.filter(v => level_id != v).length > 0)
                {
                    // Non-preferred
                    other.push(item);
                }
            }
            else if (item.children)
            {
                // Descend
                cur.splice(i+1, 0, ...item.children);
            }
        }
        // '?=' is a soft preference: fall back to non-preferred items
        // when nothing matched exactly
        cur = preferred.length ? preferred : other;
    }
    // Get leaf items
    for (let i = 0; i < cur.length; i++)
    {
        if (cur[i].children)
        {
            // Descend
            cur.splice(i, 1, ...cur[i].children);
            i--;
        }
    }
    return cur;
}
|
||||
|
||||
// Convert from
// node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
// to
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
// The root entry '' ends up holding only leaf nodes in its children array,
// and every node's "levels" map accumulates the IDs of all its ancestors.
function index_tree(node_list)
{
    const by_id = { '': { children: [], levels: {} } };
    // Pass 1: copy every node into the index, dropping any stray "children" field
    for (const n of node_list)
    {
        const copy = { ...n, levels: {} };
        delete copy.children;
        by_id[n.id] = copy;
    }
    // Pass 2: attach each node to its parent (or to the root if the parent is missing/unknown)
    for (const n of node_list)
    {
        const pid = n.parent && by_id[n.parent] ? n.parent : '';
        (by_id[pid].children = by_id[pid].children || []).push(by_id[n.id]);
    }
    // Pass 3: flatten the root's children down to leaves, propagating ancestor
    // level IDs into each child's "levels" map along the way
    const queue = by_id[''].children;
    for (let i = 0; i < queue.length; i++)
    {
        const item = queue[i];
        item.levels[item.level] = item.id;
        if (item.children)
        {
            for (const child of item.children)
            {
                child.levels = { ...item.levels, ...child.levels };
            }
            // Replace the inner node by its children and revisit the same index
            queue.splice(i--, 1, ...item.children);
        }
    }
    return by_id;
}
|
||||
|
||||
// selection = id[]
// osd_tree = index_tree output
// rules = parse_pg_dsl output
// Returns the subset of <pgs> whose OSDs still satisfy every positional rule.
// Fix vs previous version: when selected[i] was null (unknown OSD) and the
// rule produced no candidates, the old `(A && B) || C` condition fell through
// to C and crashed on `selected[i].id`; now null is handled explicitly.
function check_custom_combinations(osd_tree, rules, pgs)
{
    const res = [];
    skip_pg: for (const pg of pgs)
    {
        // Map OSD IDs to tree nodes; unknown OSDs become null
        let selected = pg.map(id => osd_tree[id] || null);
        for (let i = 0; i < rules.length; i++)
        {
            const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
            if (selected[i] === null)
            {
                // Position is unfilled: invalid if the rule could select something
                if (filtered.length)
                {
                    continue skip_pg;
                }
            }
            else if (!filtered.some(ok => selected[i].id === ok.id))
            {
                // The chosen OSD is not among the candidates allowed by this rule
                continue skip_pg;
            }
        }
        res.push(pg);
    }
    return res;
}
|
||||
|
||||
// Public API of the rule-based PG combination module
module.exports = {
    RuleCombinator,
    NO_OSD,

    index_tree,
    parse_level_indexes,
    parse_pg_dsl,
    random_custom_combinations,
    check_custom_combinations,
};
|
|
@ -50,15 +50,15 @@ async function lp_solve(text)
|
|||
return { score, vars };
|
||||
}
|
||||
|
||||
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
// osd_weights = { [id]: weight }
|
||||
async function optimize_initial({ osd_weights, combinator, pg_count, pg_size = 3, pg_minsize = 2, parity_space = 1, ordered = false })
|
||||
{
|
||||
if (!pg_count || !osd_tree)
|
||||
if (!pg_count || !osd_weights)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
const all_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
|
||||
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const all_pgs = Object.values(make_cyclic(combinator.random_combinations(), parity_space));
|
||||
const pg_per_osd = {};
|
||||
for (const pg of all_pgs)
|
||||
{
|
||||
|
@ -69,15 +69,15 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
|||
pg_per_osd[osd].push((i >= pg_minsize ? parity_space+'*' : '')+"pg_"+pg.join("_"));
|
||||
}
|
||||
}
|
||||
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length)
|
||||
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
|
||||
let pg_effsize = all_pgs.reduce((a, c) => Math.max(a, c.filter(e => e != NO_OSD).length), 0);
|
||||
pg_effsize = Math.min(pg_minsize, pg_effsize) + Math.max(0, Math.min(pg_size, pg_effsize) - pg_minsize) * parity_space;
|
||||
let lp = '';
|
||||
lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n";
|
||||
for (const osd in pg_per_osd)
|
||||
{
|
||||
if (osd !== NO_OSD)
|
||||
{
|
||||
let osd_pg_count = all_weights[osd]/total_weight*pg_effsize*pg_count;
|
||||
let osd_pg_count = osd_weights[osd]/total_weight*pg_effsize*pg_count;
|
||||
lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
|
||||
}
|
||||
}
|
||||
|
@ -93,7 +93,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
|||
throw new Error('Problem is infeasible or unbounded - is it a bug?');
|
||||
}
|
||||
const int_pgs = make_int_pgs(lp_result.vars, pg_count, ordered);
|
||||
const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space);
|
||||
const eff = pg_list_space_efficiency(int_pgs, osd_weights, pg_minsize, parity_space);
|
||||
const res = {
|
||||
score: lp_result.score,
|
||||
weights: lp_result.vars,
|
||||
|
@ -104,6 +104,22 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
|||
return res;
|
||||
}
|
||||
|
||||
// Add all cyclic rotations of each PG to <pgs> (mutated in place and returned).
// Rotations matter only when parity_space > 1 (EC/XOR pools where OSD order counts).
// pgs = { ['pg_'+osds.join('_')]: osd_id[] }
// Fix vs previous version: the old code did `for (const pg in pgs)` (iterating
// key strings) and used `pg.size` (always undefined), so it never generated
// a single rotation.
function make_cyclic(pgs, parity_space)
{
    if (parity_space > 1)
    {
        // Iterate over a snapshot of the keys - we add new entries while looping
        for (const pg_name of Object.keys(pgs))
        {
            const pg = pgs[pg_name];
            for (let i = 1; i < pg.length; i++)
            {
                const cyclic = [ ...pg.slice(i), ...pg.slice(0, i) ];
                pgs['pg_'+cyclic.join('_')] = cyclic;
            }
        }
    }
    return pgs;
}
|
||||
|
||||
function shuffle(array)
|
||||
{
|
||||
for (let i = array.length - 1, j, x; i > 0; i--)
|
||||
|
@ -199,7 +215,7 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
|
|||
{
|
||||
const intersect_count = ordered
|
||||
? pg.reduce((a, osd, i) => a + (prev_hash[osd] == 1+i ? 1 : 0), 0)
|
||||
: pg.reduce((a, osd, i) => a + (prev_hash[osd] ? 1 : 0), 0);
|
||||
: pg.reduce((a, osd) => a + (prev_hash[osd] ? 1 : 0), 0);
|
||||
if (max_int < intersect_count)
|
||||
{
|
||||
max_int = intersect_count;
|
||||
|
@ -216,47 +232,17 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
|
|||
return move_weights;
|
||||
}
|
||||
|
||||
// Add previous PG combinations that are still valid under the current OSD tree.
// osd_tree = { host: { osd: size } } (flat two-level tree)
// prev_weights = { ['pg_'+osds.join('_')]: weight } - previously existing PGs
// all_pgs = { pg_name: osd_id[] } - mutated in place
// A previous PG is still valid if every OSD exists and all OSDs are on distinct hosts.
// Fix vs previous version: removed the dead local `hosts` (computed, never used).
function add_valid_previous(osd_tree, prev_weights, all_pgs)
{
    // Map each OSD to its host
    const host_per_osd = {};
    for (const host in osd_tree)
    {
        for (const osd in osd_tree[host])
        {
            host_per_osd[osd] = host;
        }
    }
    skip_pg: for (const pg_name in prev_weights)
    {
        const seen_hosts = {};
        // PG names are 'pg_<osd>_<osd>_...' - strip the prefix and split
        const pg = pg_name.substr(3).split(/_/);
        for (const osd of pg)
        {
            // Reject PGs with missing OSDs or two OSDs on the same host
            if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
            {
                continue skip_pg;
            }
            seen_hosts[host_per_osd[osd]] = true;
        }
        if (!all_pgs[pg_name])
        {
            all_pgs[pg_name] = pg;
        }
    }
}
|
||||
|
||||
// Try to minimize data movement
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
|
||||
async function optimize_change({ prev_pgs: prev_int_pgs, osd_weights, combinator, pg_size = 3, pg_minsize = 2, parity_space = 1, ordered = false })
|
||||
{
|
||||
if (!osd_tree)
|
||||
if (!osd_weights)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
// FIXME: use parity_chunks with parity_space instead of pg_minsize
|
||||
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length)
|
||||
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
|
||||
let all_pgs = make_cyclic(combinator.random_combinations(), parity_space);
|
||||
let pg_effsize = Object.values(all_pgs).reduce((a, c) => Math.max(a, c.filter(e => e != NO_OSD).length), 0);
|
||||
pg_effsize = Math.min(pg_minsize, pg_effsize) + Math.max(0, Math.min(pg_size, pg_effsize) - pg_minsize) * parity_space;
|
||||
const pg_count = prev_int_pgs.length;
|
||||
const prev_weights = {};
|
||||
const prev_pg_per_osd = {};
|
||||
|
@ -273,10 +259,13 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
|||
}
|
||||
const old_pg_size = prev_int_pgs[0].length;
|
||||
// Get all combinations
|
||||
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
|
||||
if (old_pg_size == pg_size)
|
||||
{
|
||||
add_valid_previous(osd_tree, prev_weights, all_pgs);
|
||||
const still_valid = combinator.check_combinations(Object.keys(prev_weights).map(pg_name => pg_name.substr(3).split('_')));
|
||||
for (const pg of still_valid)
|
||||
{
|
||||
all_pgs['pg_'+pg.join('_')] = pg;
|
||||
}
|
||||
}
|
||||
all_pgs = Object.values(all_pgs);
|
||||
const pg_per_osd = {};
|
||||
|
@ -295,8 +284,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
|||
// Calculate total weight - old PG weights
|
||||
const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
|
||||
const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {});
|
||||
const all_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
// Generate the LP problem
|
||||
let lp = '';
|
||||
lp += 'max: '+all_pg_names.map(pg_name => (
|
||||
|
@ -311,7 +299,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
|||
)).join(' + ');
|
||||
const rm_osd_pg_count = (prev_pg_per_osd[osd]||[])
|
||||
.reduce((a, [ old_pg_name, space ]) => (a + (all_pgs_hash[old_pg_name] ? space : 0)), 0);
|
||||
const osd_pg_count = all_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count;
|
||||
const osd_pg_count = osd_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count;
|
||||
lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
|
||||
}
|
||||
}
|
||||
|
@ -421,7 +409,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
|
|||
int_pgs: new_pgs,
|
||||
differs,
|
||||
osd_differs,
|
||||
space: pg_effsize * pg_list_space_efficiency(new_pgs, all_weights, pg_minsize, parity_space),
|
||||
space: pg_effsize * pg_list_space_efficiency(new_pgs, osd_weights, pg_minsize, parity_space),
|
||||
total_space: total_weight,
|
||||
};
|
||||
}
|
||||
|
@ -502,198 +490,6 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
|
|||
}
|
||||
}
|
||||
|
||||
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
|
||||
// levels = { string: number }
|
||||
// to a two-level osd_tree suitable for all_combinations()
|
||||
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
|
||||
{
|
||||
osd_level = levels[osd_level] || osd_level;
|
||||
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
|
||||
for (const node of osd_tree)
|
||||
{
|
||||
if ((levels[node.level] || node.level) < failure_domain_level)
|
||||
{
|
||||
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
|
||||
}
|
||||
else
|
||||
{
|
||||
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
|
||||
}
|
||||
}
|
||||
return domains;
|
||||
}
|
||||
|
||||
function extract_osds(osd_tree, levels, osd_level, osds = {})
|
||||
{
|
||||
for (const node of osd_tree)
|
||||
{
|
||||
if ((levels[node.level] || node.level) >= osd_level)
|
||||
{
|
||||
osds[node.id] = node.size;
|
||||
}
|
||||
else
|
||||
{
|
||||
extract_osds(node.children||[], levels, osd_level, osds);
|
||||
}
|
||||
}
|
||||
return osds;
|
||||
}
|
||||
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
// Generate random PG combinations for a flat two-level tree
// osd_tree = { failure_domain: { osd: size } }
// pg_size = OSDs per PG, count = number of extra purely random combinations
// Returns { ['pg_'+osds.join('_')]: osd_id[] }
// Fix vs previous version: the ordered branch iterated `i < pg.size` on an
// array (always undefined), so cyclic rotations were never actually generated.
// Also renamed the inner `r` (random index) which shadowed the result object.
function random_combinations(osd_tree, pg_size, count, ordered)
{
    // Deterministic xorshift RNG so results are stable between runs
    let seed = 0x5f020e43;
    let rng = () =>
    {
        seed ^= seed << 13;
        seed ^= seed >> 17;
        seed ^= seed << 5;
        return seed + 2147483648;
    };
    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
    const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
    const r = {};
    // Generate random combinations including each OSD at least once
    for (let h = 0; h < hosts.length; h++)
    {
        for (let o = 0; o < osds[hosts[h]].length; o++)
        {
            const pg = [ osds[hosts[h]][o] ];
            const cur_hosts = [ ...hosts ];
            cur_hosts.splice(h, 1);
            // Fill the remaining positions from distinct random hosts
            for (let i = 1; i < pg_size && i < hosts.length; i++)
            {
                const next_host = rng() % cur_hosts.length;
                const next_osd = rng() % osds[cur_hosts[next_host]].length;
                pg.push(osds[cur_hosts[next_host]][next_osd]);
                cur_hosts.splice(next_host, 1);
            }
            const cyclic_pgs = [ pg ];
            if (ordered)
            {
                // For ordered pools every rotation is a distinct PG
                // (was: i < pg.size - never true, so rotations were lost)
                for (let i = 1; i < pg.length; i++)
                {
                    cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
                }
            }
            for (const cyclic of cyclic_pgs)
            {
                // Pad with NO_OSD when there are fewer hosts than pg_size
                while (cyclic.length < pg_size)
                {
                    cyclic.push(NO_OSD);
                }
                r['pg_'+cyclic.join('_')] = cyclic;
            }
        }
    }
    // Generate purely random combinations
    while (count > 0)
    {
        let host_idx = [];
        const cur_hosts = [ ...hosts.map((h, i) => i) ];
        const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
        if (ordered)
        {
            for (let i = 0; i < max_hosts; i++)
            {
                const ri = rng() % cur_hosts.length;
                host_idx[i] = cur_hosts[ri];
                cur_hosts.splice(ri, 1);
            }
        }
        else
        {
            // Unordered: pick host indexes in increasing order so that
            // permutations of the same host set can't be generated twice
            for (let i = 0; i < max_hosts; i++)
            {
                const ri = rng() % (cur_hosts.length - (max_hosts - i - 1));
                host_idx[i] = cur_hosts[ri];
                cur_hosts.splice(0, ri+1);
            }
        }
        let pg = host_idx.map(h => osds[hosts[h]][rng() % osds[hosts[h]].length]);
        while (pg.length < pg_size)
        {
            pg.push(NO_OSD);
        }
        r['pg_'+pg.join('_')] = pg;
        count--;
    }
    return r;
}
|
||||
|
||||
// Super-stupid algorithm. Given the current OSD tree, generate all possible OSD combinations
|
||||
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
|
||||
// ordered = return combinations without duplicates having different order
|
||||
function all_combinations(osd_tree, pg_size, ordered, count)
|
||||
{
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||
while (hosts.length < pg_size)
|
||||
{
|
||||
osds[NO_OSD] = [ NO_OSD ];
|
||||
hosts.push(NO_OSD);
|
||||
}
|
||||
let host_idx = [];
|
||||
let osd_idx = [];
|
||||
for (let i = 0; i < pg_size; i++)
|
||||
{
|
||||
host_idx.push(i);
|
||||
osd_idx.push(0);
|
||||
}
|
||||
const r = [];
|
||||
while (!count || count < 0 || r.length < count)
|
||||
{
|
||||
r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
|
||||
let inc = pg_size-1;
|
||||
while (inc >= 0)
|
||||
{
|
||||
osd_idx[inc]++;
|
||||
if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
|
||||
{
|
||||
osd_idx[inc] = 0;
|
||||
inc--;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (inc < 0)
|
||||
{
|
||||
// no osds left in the current host combination, select the next one
|
||||
inc = pg_size-1;
|
||||
same_again: while (inc >= 0)
|
||||
{
|
||||
host_idx[inc]++;
|
||||
for (let prev_host = 0; prev_host < inc; prev_host++)
|
||||
{
|
||||
if (host_idx[prev_host] == host_idx[inc])
|
||||
{
|
||||
continue same_again;
|
||||
}
|
||||
}
|
||||
if (host_idx[inc] < (ordered ? hosts.length-(pg_size-1-inc) : hosts.length))
|
||||
{
|
||||
while ((++inc) < pg_size)
|
||||
{
|
||||
host_idx[inc] = (ordered ? host_idx[inc-1]+1 : 0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
inc--;
|
||||
}
|
||||
}
|
||||
if (inc < 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
|
||||
{
|
||||
const per_osd = {};
|
||||
|
@ -752,11 +548,8 @@ module.exports = {
|
|||
pg_weights_space_efficiency,
|
||||
pg_list_space_efficiency,
|
||||
pg_per_osd_space_efficiency,
|
||||
flatten_tree,
|
||||
|
||||
lp_solve,
|
||||
make_int_pgs,
|
||||
align_pgs,
|
||||
random_combinations,
|
||||
all_combinations,
|
||||
};
|
||||
|
|
295
mon/mon.js
295
mon/mon.js
|
@ -6,6 +6,8 @@ const http = require('http');
|
|||
const crypto = require('crypto');
|
||||
const os = require('os');
|
||||
const WebSocket = require('ws');
|
||||
const { RuleCombinator, parse_level_indexes, parse_pg_dsl } = require('./dsl_pgs.js');
|
||||
const { SimpleCombinator, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const stableStringify = require('./stable-stringify.js');
|
||||
const PGUtil = require('./PGUtil.js');
|
||||
|
@ -35,7 +37,6 @@ const etcd_allow = new RegExp('^'+[
|
|||
'pg/state/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/stats/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
||||
'pool/stats/[1-9]\\d*',
|
||||
'history/last_clean_pgs',
|
||||
'inode/stats/[1-9]\\d*/\\d+',
|
||||
'pool/stats/[1-9]\\d*',
|
||||
|
@ -63,6 +64,7 @@ const etcd_tree = {
|
|||
mon_stats_timeout: 1000, // ms. min: 100
|
||||
osd_out_time: 600, // seconds. min: 0
|
||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||
use_old_pg_combinator: false,
|
||||
// client and osd
|
||||
tcp_header_buffer_size: 65536,
|
||||
use_sync_send_recv: false,
|
||||
|
@ -88,6 +90,8 @@ const etcd_tree = {
|
|||
client_max_writeback_iodepth: 256,
|
||||
client_retry_interval: 50, // ms. min: 10
|
||||
client_eio_retry_interval: 1000, // ms
|
||||
client_retry_enospc: true,
|
||||
osd_nearfull_ratio: 0.95,
|
||||
// client and osd - configurable online
|
||||
log_level: 0,
|
||||
peer_connect_interval: 5, // seconds. min: 1
|
||||
|
@ -185,7 +189,12 @@ const etcd_tree = {
|
|||
// number of parity chunks, required for EC
|
||||
parity_chunks?: 1,
|
||||
pg_count: 100,
|
||||
failure_domain: 'host',
|
||||
// default is failure_domain=host
|
||||
failure_domain?: 'host',
|
||||
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
|
||||
level_placement?: 'dc=112233 host=123456',
|
||||
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
|
||||
old_combinator: false,
|
||||
max_osd_combinations: 10000,
|
||||
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
|
||||
block_size: 131072,
|
||||
|
@ -205,7 +214,7 @@ const etcd_tree = {
|
|||
}, */
|
||||
pools: {},
|
||||
osd: {
|
||||
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
|
||||
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
|
||||
},
|
||||
/* pgs: {
|
||||
hash: string,
|
||||
|
@ -575,7 +584,7 @@ class Mon
|
|||
now = Date.now();
|
||||
}
|
||||
tried[base] = now;
|
||||
const ok = await new Promise((ok, no) =>
|
||||
const ok = await new Promise(ok =>
|
||||
{
|
||||
const timer_id = setTimeout(() =>
|
||||
{
|
||||
|
@ -737,6 +746,7 @@ class Mon
|
|||
this.save_last_clean_running = true;
|
||||
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
|
||||
const new_clean_pgs = { items: {} };
|
||||
// eslint-disable-next-line indent
|
||||
next_pool:
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
|
@ -819,6 +829,7 @@ class Mon
|
|||
async become_master()
|
||||
{
|
||||
const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
|
||||
// eslint-disable-next-line no-constant-condition
|
||||
while (1)
|
||||
{
|
||||
const res = await this.etcd_call('/kv/txn', {
|
||||
|
@ -861,34 +872,21 @@ class Mon
|
|||
const levels = this.config.placement_levels||{};
|
||||
levels.host = levels.host || 100;
|
||||
levels.osd = levels.osd || 101;
|
||||
const tree = { '': { children: [] } };
|
||||
const tree = {};
|
||||
let up_osds = {};
|
||||
for (const node_id in this.state.config.node_placement||{})
|
||||
{
|
||||
const node_cfg = this.state.config.node_placement[node_id];
|
||||
if (/^\d+$/.exec(node_id))
|
||||
{
|
||||
node_cfg.level = 'osd';
|
||||
}
|
||||
if (!node_id || !node_cfg.level || !levels[node_cfg.level])
|
||||
{
|
||||
// All nodes must have non-empty IDs and valid levels
|
||||
continue;
|
||||
}
|
||||
tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
|
||||
}
|
||||
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
|
||||
const down_time = Date.now()/1000 - this.config.osd_out_time;
|
||||
for (const osd_num of this.all_osds().sort((a, b) => a - b))
|
||||
{
|
||||
const stat = this.state.osd.stats[osd_num];
|
||||
if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
|
||||
const osd_cfg = this.state.config.osd[osd_num];
|
||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||
if (reweight < 0 || isNaN(reweight))
|
||||
reweight = 1;
|
||||
if (stat && stat.size && reweight && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||
osd_cfg && osd_cfg.noout))
|
||||
{
|
||||
// Numeric IDs are reserved for OSDs
|
||||
const osd_cfg = this.state.config.osd[osd_num];
|
||||
let reweight = osd_cfg && Number(osd_cfg.reweight);
|
||||
if (reweight < 0 || isNaN(reweight))
|
||||
reweight = 1;
|
||||
if (this.state.osd.state[osd_num] && reweight > 0)
|
||||
{
|
||||
// React to down OSDs immediately
|
||||
|
@ -916,6 +914,43 @@ class Mon
|
|||
}
|
||||
}
|
||||
}
|
||||
for (const node_id in this.state.config.node_placement||{})
|
||||
{
|
||||
const node_cfg = this.state.config.node_placement[node_id];
|
||||
if (/^\d+$/.exec(node_id))
|
||||
{
|
||||
node_cfg.level = 'osd';
|
||||
}
|
||||
if (!node_id || !node_cfg.level || !levels[node_cfg.level] ||
|
||||
node_cfg.level === 'osd' && !tree[node_id])
|
||||
{
|
||||
// All nodes must have non-empty IDs and valid levels
|
||||
// OSDs have to actually exist
|
||||
continue;
|
||||
}
|
||||
tree[node_id] = tree[node_id] || {};
|
||||
tree[node_id].id = node_id;
|
||||
tree[node_id].level = node_cfg.level;
|
||||
tree[node_id].parent = node_cfg.parent;
|
||||
if (node_cfg.level !== 'osd')
|
||||
{
|
||||
tree[node_id].children = [];
|
||||
}
|
||||
}
|
||||
return { up_osds, levels, osd_tree: tree };
|
||||
}
|
||||
|
||||
make_hier_tree(tree)
|
||||
{
|
||||
const levels = this.config.placement_levels||{};
|
||||
levels.host = levels.host || 100;
|
||||
levels.osd = levels.osd || 101;
|
||||
tree = { ...tree };
|
||||
for (const node_id in tree)
|
||||
{
|
||||
tree[node_id] = { ...tree[node_id], children: [] };
|
||||
}
|
||||
tree[''] = { children: [] };
|
||||
for (const node_id in tree)
|
||||
{
|
||||
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
|
||||
|
@ -930,9 +965,27 @@ class Mon
|
|||
// Parent's level must be less than child's; OSDs must be leaves
|
||||
const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
|
||||
tree[parent].children.push(tree[node_id]);
|
||||
delete node_cfg.parent;
|
||||
}
|
||||
return { up_osds, levels, osd_tree: tree };
|
||||
// Delete empty nodes
|
||||
let deleted = 0;
|
||||
do
|
||||
{
|
||||
deleted = 0;
|
||||
for (const node_id in tree)
|
||||
{
|
||||
if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
|
||||
{
|
||||
const parent = tree[node_id].parent;
|
||||
if (parent)
|
||||
{
|
||||
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
|
||||
}
|
||||
deleted++;
|
||||
delete tree[node_id];
|
||||
}
|
||||
}
|
||||
} while (deleted > 0);
|
||||
return tree;
|
||||
}
|
||||
|
||||
async stop_all_pgs(pool_id)
|
||||
|
@ -968,7 +1021,7 @@ class Mon
|
|||
const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
|
||||
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
|
||||
}
|
||||
const res = await this.etcd_call('/kv/txn', {
|
||||
await this.etcd_call('/kv/txn', {
|
||||
compare: [
|
||||
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
|
||||
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
|
||||
|
@ -1096,7 +1149,6 @@ class Mon
|
|||
pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
|
||||
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
|
||||
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
|
||||
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
|
||||
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
|
||||
if (!/^[1-9]\d*$/.exec(''+pool_id))
|
||||
{
|
||||
|
@ -1176,10 +1228,45 @@ class Mon
|
|||
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
|
||||
return false;
|
||||
}
|
||||
if (!this.get_pg_rules(pool_id, pool_cfg, true))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
filter_osds_by_tags(orig_tree, flat_tree, tags)
|
||||
filter_osds_by_root_node(pool_tree, root_node)
|
||||
{
|
||||
if (!root_node)
|
||||
{
|
||||
return;
|
||||
}
|
||||
let hier_tree = this.make_hier_tree(pool_tree);
|
||||
let included = [ ...(hier_tree[root_node] || {}).children||[] ];
|
||||
for (let i = 0; i < included.length; i++)
|
||||
{
|
||||
if (included[i].children)
|
||||
{
|
||||
included.splice(i+1, 0, ...included[i].children);
|
||||
}
|
||||
}
|
||||
let cur = pool_tree[root_node] || {};
|
||||
while (cur && cur.id)
|
||||
{
|
||||
included.unshift(cur);
|
||||
cur = pool_tree[cur.parent||''];
|
||||
}
|
||||
included = included.reduce((a, c) => { a[c.id||''] = true; return a; }, {});
|
||||
for (const item in pool_tree)
|
||||
{
|
||||
if (!included[item])
|
||||
{
|
||||
delete pool_tree[item];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
filter_osds_by_tags(orig_tree, tags)
|
||||
{
|
||||
if (!tags)
|
||||
{
|
||||
|
@ -1187,30 +1274,22 @@ class Mon
|
|||
}
|
||||
for (const tag of (tags instanceof Array ? tags : [ tags ]))
|
||||
{
|
||||
for (const host in flat_tree)
|
||||
for (const osd in orig_tree)
|
||||
{
|
||||
let found = 0;
|
||||
for (const osd in flat_tree[host])
|
||||
if (orig_tree[osd].level === 'osd' &&
|
||||
(!orig_tree[osd].tags || !orig_tree[osd].tags[tag]))
|
||||
{
|
||||
if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
|
||||
delete flat_tree[host][osd];
|
||||
else
|
||||
found++;
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete flat_tree[host];
|
||||
delete orig_tree[osd];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
|
||||
filter_osds_by_block_layout(orig_tree, block_size, bitmap_granularity, immediate_commit)
|
||||
{
|
||||
for (const host in flat_tree)
|
||||
for (const osd in orig_tree)
|
||||
{
|
||||
let found = 0;
|
||||
for (const osd in flat_tree[host])
|
||||
if (orig_tree[osd].level === 'osd')
|
||||
{
|
||||
const osd_stat = this.state.osd.stats[osd];
|
||||
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
|
||||
|
@ -1218,16 +1297,8 @@ class Mon
|
|||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
|
||||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
|
||||
{
|
||||
delete flat_tree[host][osd];
|
||||
delete orig_tree[osd];
|
||||
}
|
||||
else
|
||||
{
|
||||
found++;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete flat_tree[host];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1237,12 +1308,84 @@ class Mon
|
|||
let aff_osds = up_osds;
|
||||
if (pool_cfg.primary_affinity_tags)
|
||||
{
|
||||
aff_osds = { ...up_osds };
|
||||
this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags);
|
||||
aff_osds = Object.keys(up_osds).reduce((a, c) => { a[c] = osd_tree[c]; return a; }, {});
|
||||
this.filter_osds_by_tags(aff_osds, pool_cfg.primary_affinity_tags);
|
||||
for (const osd in aff_osds)
|
||||
{
|
||||
aff_osds[osd] = true;
|
||||
}
|
||||
}
|
||||
return aff_osds;
|
||||
}
|
||||
|
||||
get_pg_rules(pool_id, pool_cfg, warn)
|
||||
{
|
||||
if (pool_cfg.level_placement)
|
||||
{
|
||||
const pg_size = (0|pool_cfg.pg_size);
|
||||
let rules = pool_cfg.level_placement;
|
||||
if (typeof rules === 'string')
|
||||
{
|
||||
rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
|
||||
}
|
||||
else
|
||||
{
|
||||
rules = { ...rules };
|
||||
}
|
||||
// Always add failure_domain to prevent rules from being totally incorrect
|
||||
const all_diff = [];
|
||||
for (let i = 1; i <= pg_size; i++)
|
||||
{
|
||||
all_diff.push(i);
|
||||
}
|
||||
rules[pool_cfg.failure_domain || 'host'] = all_diff;
|
||||
const levels = this.config.placement_levels||{};
|
||||
levels.host = levels.host || 100;
|
||||
levels.osd = levels.osd || 101;
|
||||
for (const k in rules)
|
||||
{
|
||||
if (!levels[k] || typeof rules[k] !== 'string' &&
|
||||
(!(rules[k] instanceof Array) ||
|
||||
rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
|
||||
return null;
|
||||
}
|
||||
else if (rules[k].length != pg_size)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return parse_level_indexes(rules);
|
||||
}
|
||||
else if (typeof pool_cfg.raw_placement === 'string')
|
||||
{
|
||||
try
|
||||
{
|
||||
return parse_pg_dsl(pool_cfg.raw_placement);
|
||||
}
|
||||
catch (e)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
let rules = [ [] ];
|
||||
let prev = [ 1 ];
|
||||
for (let i = 1; i < pool_cfg.pg_size; i++)
|
||||
{
|
||||
rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
|
||||
prev = [ ...prev, i+1 ];
|
||||
}
|
||||
return rules;
|
||||
}
|
||||
}
|
||||
|
||||
async generate_pool_pgs(pool_id, osd_tree, levels)
|
||||
{
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
|
@ -1250,16 +1393,16 @@ class Mon
|
|||
{
|
||||
return null;
|
||||
}
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''];
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
let pool_tree = { ...osd_tree };
|
||||
this.filter_osds_by_root_node(pool_tree, pool_cfg.root_node);
|
||||
this.filter_osds_by_tags(pool_tree, pool_cfg.osd_tags);
|
||||
this.filter_osds_by_block_layout(
|
||||
pool_tree,
|
||||
pool_cfg.block_size || this.config.block_size || 131072,
|
||||
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
|
||||
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
|
||||
);
|
||||
pool_tree = this.make_hier_tree(pool_tree);
|
||||
// First try last_clean_pgs to minimize data movement
|
||||
let prev_pgs = [];
|
||||
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
||||
|
@ -1276,11 +1419,15 @@ class Mon
|
|||
}
|
||||
const old_pg_count = prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_tree: pool_tree,
|
||||
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
|
||||
combinator: !this.config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
|
||||
// new algorithm:
|
||||
? new RuleCombinator(pool_tree, this.get_pg_rules(pool_id, pool_cfg), pool_cfg.max_osd_combinations)
|
||||
// old algorithm:
|
||||
: new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
};
|
||||
let optimize_result;
|
||||
|
@ -1312,7 +1459,15 @@ class Mon
|
|||
}
|
||||
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
|
||||
let pg_effsize = pool_cfg.pg_size;
|
||||
for (const pg of optimize_result.int_pgs)
|
||||
{
|
||||
const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
|
||||
if (this_pg_size && this_pg_size < pg_effsize)
|
||||
{
|
||||
pg_effsize = this_pg_size;
|
||||
}
|
||||
}
|
||||
return {
|
||||
pool_id,
|
||||
pgs: optimize_result.int_pgs,
|
||||
|
@ -1349,8 +1504,8 @@ class Mon
|
|||
// Something has changed
|
||||
console.log('Pool configuration or OSD tree changed, re-optimizing');
|
||||
// First re-optimize PGs, but don't look at history yet
|
||||
const optimize_results = await Promise.all(Object.keys(this.state.config.pools)
|
||||
.map(pool_id => this.generate_pool_pgs(pool_id, osd_tree, levels)));
|
||||
const optimize_results = (await Promise.all(Object.keys(this.state.config.pools)
|
||||
.map(pool_id => this.generate_pool_pgs(pool_id, osd_tree, levels)))).filter(r => r);
|
||||
// Then apply the modification in the form of an optimistic transaction,
|
||||
// each time considering new pg/history modifications (OSDs modify it during rebalance)
|
||||
while (!await this.apply_pool_pgs(optimize_results, up_osds, osd_tree, tree_hash))
|
||||
|
@ -1394,11 +1549,14 @@ class Mon
|
|||
{
|
||||
continue;
|
||||
}
|
||||
const replicated = pool_cfg.scheme === 'replicated';
|
||||
const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree);
|
||||
this.reset_rng();
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
if (!this.state.config.pgs.items[pool_id])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
|
||||
if (pg_cfg)
|
||||
{
|
||||
|
@ -1568,7 +1726,6 @@ class Mon
|
|||
|
||||
derive_osd_stats(st, prev, prev_diff)
|
||||
{
|
||||
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
|
||||
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
|
||||
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
|
||||
{
|
||||
|
@ -1608,7 +1765,7 @@ class Mon
|
|||
}
|
||||
for (const pool_id in st.inode_stats||{})
|
||||
{
|
||||
const pool_diff = diff.inode_stats[pool_id] = {};
|
||||
diff.inode_stats[pool_id] = {};
|
||||
for (const inode_num in st.inode_stats[pool_id])
|
||||
{
|
||||
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
|
||||
|
@ -2026,7 +2183,7 @@ class Mon
|
|||
_die(err, code)
|
||||
{
|
||||
// In fact we can just try to rejoin
|
||||
console.error(new Error(err || 'Cluster connection failed'));
|
||||
console.error(err instanceof Error ? err : new Error(err || 'Cluster connection failed'));
|
||||
process.exit(code || 2);
|
||||
}
|
||||
|
||||
|
@ -2050,7 +2207,7 @@ class Mon
|
|||
|
||||
function POST(url, body, timeout)
|
||||
{
|
||||
return new Promise((ok, no) =>
|
||||
return new Promise(ok =>
|
||||
{
|
||||
const body_text = Buffer.from(JSON.stringify(body));
|
||||
let timer_id = timeout > 0 ? setTimeout(() =>
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
// Pick the index in [0, count) whose key (produced by cb(index)) hashes to
// the largest murmur3 value. Returns null when count is falsy (0/undefined).
// Ties keep the lowest index, because only a strictly greater hash wins.
function select_murmur3(count, cb)
{
    if (!count)
    {
        return null;
    }
    let best = 0, best_hash = -1;
    for (let idx = 0; idx < count; idx++)
    {
        const cur = murmur3(cb(idx));
        if (cur > best_hash)
        {
            best_hash = cur;
            best = idx;
        }
    }
    return best;
}

// Simplified murmur3-like string hash.
// NOTE(review): this is not the real MurmurHash3 algorithm — just an
// xor/multiply/shift mix. Do not change the arithmetic: the produced values
// drive pseudo-random PG placement, so they must stay stable.
// The multiplication can exceed 2^53 (losing float precision) and
// `& 0xFFFFFFFF` yields a signed 32-bit value, so results may be negative;
// both effects are deterministic, which is all the caller needs.
function murmur3(s)
{
    let h = 0x12345678;
    for (let pos = 0; pos < s.length; pos++)
    {
        h ^= s.charCodeAt(pos);
        h = (h*0x5bd1e995) & 0xFFFFFFFF;
        h ^= (h >> 15);
    }
    return h;
}
|
||||
|
||||
// Public interface: the hash itself plus the hash-based argmax selector
// built on top of it (used by simple_pgs.js for deterministic selection).
module.exports = {
    murmur3,
    select_murmur3,
};
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "vitastor-mon",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.1",
|
||||
"description": "Vitastor SDS monitor service",
|
||||
"main": "mon-main.js",
|
||||
"scripts": {
|
||||
|
@ -11,5 +11,15 @@
|
|||
"dependencies": {
|
||||
"sprintf-js": "^1.1.2",
|
||||
"ws": "^7.2.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"eslint": "^8.0.0",
|
||||
"eslint-plugin-node": "^11.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"lint": "eslint *.js"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ async function run()
|
|||
const st = await fs.stat(options.device);
|
||||
options.device_block_size = st.blksize;
|
||||
if (st.isBlockDevice())
|
||||
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
|
||||
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device));
|
||||
else
|
||||
device_size = st.size;
|
||||
}
|
||||
|
@ -91,7 +91,7 @@ async function run()
|
|||
|
||||
function system(cmd)
|
||||
{
|
||||
return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err.message) : ok(stdout))));
|
||||
return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout/*, stderr*/) => (err ? no(err.message) : ok(stdout))));
|
||||
}
|
||||
|
||||
run().catch(err => { console.error(err); process.exit(1); });
|
||||
|
|
|
@ -0,0 +1,241 @@
|
|||
const { select_murmur3 } = require('./murmur3.js');
|
||||
|
||||
const NO_OSD = 'Z';
|
||||
|
||||
// Thin object wrapper around the module-level combination helpers, exposing
// the combinator interface (random_combinations / check_combinations) that
// the optimizer expects.
class SimpleCombinator
{
    // flat_tree: two-level tree { failure_domain: { osd: size, ... }, ... }
    // pg_size: number of OSD slots per PG
    // max_combinations: how many extra random PGs to generate
    // ordered: whether (x,y) and (y,x) count as different PGs
    constructor(flat_tree, pg_size, max_combinations, ordered)
    {
        this.ordered = ordered;
        this.max_combinations = max_combinations;
        this.pg_size = pg_size;
        this.osd_tree = flat_tree;
    }

    // Produce candidate PGs keyed by 'pg_<osd>_<osd>_...'.
    random_combinations()
    {
        return random_combinations(this.osd_tree, this.pg_size, this.max_combinations, this.ordered);
    }

    // Drop PGs that are no longer valid for the current tree.
    check_combinations(pgs)
    {
        return check_combinations(this.osd_tree, pgs);
    }
}
||||
|
||||
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
// levels = { string: number }
// to a two-level osd_tree suitable for all_combinations().
// Each subtree at or below <failure_domain_level> becomes one synthetic
// domain 'domN' mapping to { osd_id: size }; <i> carries the domain counter
// through the recursion.
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
{
    // Level names may be symbolic — translate to their numeric rank.
    const fd_rank = levels[failure_domain_level] || failure_domain_level;
    const osd_rank = levels[osd_level] || osd_level;
    for (const node of osd_tree)
    {
        if ((levels[node.level] || node.level) < fd_rank)
        {
            // Still above the failure domain — descend.
            flatten_tree(node.children||[], levels, fd_rank, osd_rank, domains, i);
        }
        else
        {
            // Reached a failure domain: collect every OSD beneath it.
            domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_rank);
        }
    }
    return domains;
}

// Collect { osd_id: size } for all nodes at or below <osd_level> in the
// given subtrees, accumulating into <osds>.
function extract_osds(osd_tree, levels, osd_level, osds = {})
{
    for (const node of osd_tree)
    {
        if ((levels[node.level] || node.level) >= osd_level)
        {
            osds[node.id] = node.size;
        }
        else
        {
            extract_osds(node.children||[], levels, osd_level, osds);
        }
    }
    return osds;
}
|
||||
|
||||
// ordered = don't treat (x,y) and (y,x) as equal
// Generate pseudo-random PG candidates over a two-level tree
// osd_tree = { host: { osd: size, ... }, ... }, in two phases:
// 1) one PG per OSD so every OSD appears at least once,
// 2) <count> additional selections.
// Selection is deterministic: it is driven by select_murmur3() over strings
// derived from the current state, not by a seeded RNG.
// Returns { 'pg_<osd>_..._<osd>': [ osd, ... ] } (keying also deduplicates).
function random_combinations(osd_tree, pg_size, count, ordered)
{
    // host -> sorted list of its OSD ids
    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
    // only hosts that actually contain OSDs
    const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
    const r = {};
    // Generate random combinations including each OSD at least once
    for (let h = 0; h < hosts.length; h++)
    {
        for (let o = 0; o < osds[hosts[h]].length; o++)
        {
            // Start the PG with this specific OSD, then fill the remaining
            // slots from other hosts, hash-selected and removed one by one.
            const pg = [ osds[hosts[h]][o] ];
            const cur_hosts = [ ...hosts ];
            cur_hosts.splice(h, 1);
            for (let i = 1; i < pg_size && i < hosts.length; i++)
            {
                // NOTE(review): the callback parameter i shadows the loop
                // variable, and ':i:' is a literal separator (looks like it
                // was meant to interpolate i — confirm this is intended; the
                // result is still deterministic either way).
                const next_host = select_murmur3(cur_hosts.length, i => pg[0]+':i:'+cur_hosts[i]);
                const next_osd = select_murmur3(osds[cur_hosts[next_host]].length, i => pg[0]+':i:'+osds[cur_hosts[next_host]][i]);
                pg.push(osds[cur_hosts[next_host]][next_osd]);
                cur_hosts.splice(next_host, 1);
            }
            // Pad with NO_OSD when there are fewer hosts than PG slots.
            while (pg.length < pg_size)
            {
                pg.push(NO_OSD);
            }
            r['pg_'+pg.join('_')] = pg;
        }
    }
    // Generate purely random combinations
    while (count > 0)
    {
        let host_idx = [];
        const cur_hosts = [ ...hosts.map((h, i) => i) ];
        const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
        if (ordered)
        {
            // Any host may land in any slot.
            for (let i = 0; i < max_hosts; i++)
            {
                // NOTE(review): this inner `const r` shadows the outer result
                // map within this block scope — safe, but easy to misread.
                const r = select_murmur3(cur_hosts.length, i => count+':h:'+cur_hosts[i]);
                host_idx[i] = cur_hosts[r];
                cur_hosts.splice(r, 1);
            }
        }
        else
        {
            // Keep host indexes ascending so each host set is produced in
            // only one order; reserve enough tail entries to fill all slots.
            for (let i = 0; i < max_hosts; i++)
            {
                const r = select_murmur3(cur_hosts.length - (max_hosts - i - 1), i => count+':h:'+cur_hosts[i]);
                host_idx[i] = cur_hosts[r];
                cur_hosts.splice(0, r+1);
            }
        }
        // Pick one OSD from each selected host, again by hash.
        let pg = host_idx.map(h => osds[hosts[h]][select_murmur3(osds[hosts[h]].length, i => count+':o:'+osds[hosts[h]][i])]);
        while (pg.length < pg_size)
        {
            pg.push(NO_OSD);
        }
        r['pg_'+pg.join('_')] = pg;
        count--;
    }
    return r;
}
|
||||
|
||||
// Super-stupid algorithm. Given the current OSD tree, generate all possible OSD combinations
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
// ordered = return combinations without duplicates having different order
// count (optional) = stop after this many combinations; falsy/negative = all.
// Enumeration works like a two-level odometer: osd_idx cycles OSD choices
// within the current host set, and when it wraps, host_idx advances to the
// next set of distinct hosts.
function all_combinations(osd_tree, pg_size, ordered, count)
{
    const hosts = Object.keys(osd_tree).sort();
    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
    // Too few hosts: pad with a synthetic NO_OSD host so PGs keep full width.
    while (hosts.length < pg_size)
    {
        osds[NO_OSD] = [ NO_OSD ];
        hosts.push(NO_OSD);
    }
    // host_idx[i] = which host fills slot i; osd_idx[i] = which of that
    // host's OSDs fills it. Start with the first pg_size hosts.
    let host_idx = [];
    let osd_idx = [];
    for (let i = 0; i < pg_size; i++)
    {
        host_idx.push(i);
        osd_idx.push(0);
    }
    const r = [];
    while (!count || count < 0 || r.length < count)
    {
        // Emit the current combination.
        r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
        // Advance the OSD odometer (rightmost slot first).
        let inc = pg_size-1;
        while (inc >= 0)
        {
            osd_idx[inc]++;
            if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
            {
                osd_idx[inc] = 0;
                inc--;
            }
            else
            {
                break;
            }
        }
        if (inc < 0)
        {
            // no osds left in the current host combination, select the next one
            inc = pg_size-1;
            same_again: while (inc >= 0)
            {
                host_idx[inc]++;
                // Hosts within one PG must be distinct; retry on collision.
                for (let prev_host = 0; prev_host < inc; prev_host++)
                {
                    if (host_idx[prev_host] == host_idx[inc])
                    {
                        continue same_again;
                    }
                }
                // ordered mode keeps host indexes strictly ascending, so the
                // upper bound leaves room for the slots to the right.
                if (host_idx[inc] < (ordered ? hosts.length-(pg_size-1-inc) : hosts.length))
                {
                    // Reset all slots to the right of the advanced one.
                    while ((++inc) < pg_size)
                    {
                        host_idx[inc] = (ordered ? host_idx[inc-1]+1 : 0);
                    }
                    break;
                }
                else
                {
                    inc--;
                }
            }
            if (inc < 0)
            {
                // Host odometer exhausted — enumeration complete.
                break;
            }
        }
    }
    return r;
}
|
||||
|
||||
// Validate PGs against the current two-level OSD tree: a PG survives only if
// every OSD in it still exists and all of its OSDs live on distinct hosts.
// Returns the surviving PGs (the same array objects, in the original order).
function check_combinations(osd_tree, pgs)
{
    // Invert the tree into an osd -> host lookup table.
    const host_per_osd = {};
    for (const host in osd_tree)
    {
        for (const osd in osd_tree[host])
        {
            host_per_osd[osd] = host;
        }
    }
    const pg_is_valid = (pg) =>
    {
        const seen_hosts = {};
        for (const osd of pg)
        {
            const host = host_per_osd[osd];
            // Unknown OSD, or two OSDs on one host — reject the whole PG.
            if (!host || seen_hosts[host])
            {
                return false;
            }
            seen_hosts[host] = true;
        }
        return true;
    };
    return pgs.filter(pg_is_valid);
}
|
||||
|
||||
function compat(params)
|
||||
{
|
||||
return {
|
||||
...params,
|
||||
osd_weights: Object.assign({}, ...Object.values(params.osd_tree)),
|
||||
combinator: new SimpleCombinator(params.osd_tree, params.pg_size, params.max_combinations||10000),
|
||||
};
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
flatten_tree,
|
||||
all_combinations,
|
||||
SimpleCombinator,
|
||||
compat,
|
||||
NO_OSD,
|
||||
};
|
|
@ -7,6 +7,7 @@
|
|||
// This leads to really uneven OSD fill ratio in Ceph even when PGs are perfectly balanced.
|
||||
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
|
||||
|
||||
const { SimpleCombinator } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
|
@ -114,16 +115,17 @@ Fine, let's try to optimize for it.
|
|||
|
||||
async function run()
|
||||
{
|
||||
const all_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, all_weights, 2, 2.26);
|
||||
const osd_weights = Object.assign({}, ...Object.values(osd_tree));
|
||||
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, osd_weights, 2, 2.26);
|
||||
const orig = eff*4.26 / total_weight;
|
||||
console.log('Original efficiency was: '+Math.round(orig*10000)/100+' %');
|
||||
|
||||
let prev = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256, parity_space: 2.26 });
|
||||
const combinator = new SimpleCombinator(osd_tree, 3, 10000);
|
||||
let prev = await LPOptimizer.optimize_initial({ osd_weights, combinator, pg_size: 3, pg_count: 256, parity_space: 2.26 });
|
||||
LPOptimizer.print_change_stats(prev);
|
||||
|
||||
let next = await LPOptimizer.optimize_change({ prev_pgs, osd_tree, pg_size: 3, max_combinations: 10000, parity_space: 2.26 });
|
||||
let next = await LPOptimizer.optimize_change({ prev_pgs, osd_weights, combinator, pg_size: 3, parity_space: 2.26 });
|
||||
LPOptimizer.print_change_stats(next);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
async function run()
|
||||
|
@ -14,26 +15,26 @@ async function run()
|
|||
let res;
|
||||
|
||||
console.log('16 PGs, size=3');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: false });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 16, ordered: false }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 3, 'Initial distribution');
|
||||
console.log('\nChange size to 2');
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space >= 3*14/16 && res.osd_differs == 0, 'Redistribution');
|
||||
console.log('\nRemove OSD 3');
|
||||
const no3_tree = { ...osd_tree };
|
||||
delete no3_tree['300'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 2, 'Redistribution after OSD removal');
|
||||
|
||||
console.log('\n16 PGs, size=3, ordered');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: true });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 16, ordered: true }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 3, 'Initial distribution');
|
||||
console.log('\nChange size to 2, ordered');
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space >= 3*14/16 && res.osd_differs < 8, 'Redistribution');
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const crush_tree = [
|
||||
|
@ -36,44 +37,44 @@ const crush_tree = [
|
|||
] },
|
||||
];
|
||||
|
||||
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
|
||||
const osd_tree = flatten_tree(crush_tree, {}, 1, 3);
|
||||
console.log(osd_tree);
|
||||
|
||||
async function run()
|
||||
{
|
||||
const cur_tree = {};
|
||||
console.log('Empty tree:');
|
||||
let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 });
|
||||
let res = await LPOptimizer.optimize_initial(compat({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
console.log('\nAdding 1st failure domain:');
|
||||
cur_tree['dom1'] = osd_tree['dom1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nAdding 2nd failure domain:');
|
||||
cur_tree['dom2'] = osd_tree['dom2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nAdding 3rd failure domain:');
|
||||
cur_tree['dom3'] = osd_tree['dom3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 36 && res.total_space == 36);
|
||||
console.log('\nRemoving 3rd failure domain:');
|
||||
delete cur_tree['dom3'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 24 && res.total_space == 24);
|
||||
console.log('\nRemoving 2nd failure domain:');
|
||||
delete cur_tree['dom2'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 12 && res.total_space == 12);
|
||||
console.log('\nRemoving 1st failure domain:');
|
||||
delete cur_tree['dom1'];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
assert(res.space == 0);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
|
@ -20,13 +21,13 @@ async function run()
|
|||
{
|
||||
let res;
|
||||
console.log('256 PGs, 3+3 OSDs, size=2');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 2, pg_count: 256 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
|
||||
// Should NOT fail with the "unfeasible or unbounded" exception
|
||||
console.log('\nRemoving osd.2');
|
||||
delete osd_tree[100][2];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
const { compat, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
|
||||
const osd_tree = {
|
||||
|
@ -19,7 +20,7 @@ const osd_tree = {
|
|||
},
|
||||
500: {
|
||||
4: 3.58498,
|
||||
// 8: 3.58589,
|
||||
/*8: 3.58589,*/
|
||||
9: 3.63869,
|
||||
},
|
||||
600: {
|
||||
|
@ -84,31 +85,31 @@ async function run()
|
|||
// Space efficiency is ~99% in all cases.
|
||||
|
||||
console.log('256 PGs, size=2');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 2, pg_count: 256 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
console.log('\nAdding osd.8');
|
||||
osd_tree[500][8] = 3.58589;
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
console.log('\nRemoving osd.8');
|
||||
delete osd_tree[500][8];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
|
||||
console.log('\n256 PGs, size=3');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256 });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 256 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
console.log('\nAdding osd.8');
|
||||
osd_tree[500][8] = 3.58589;
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
console.log('\nRemoving osd.8');
|
||||
delete osd_tree[500][8];
|
||||
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
|
||||
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
|
||||
console.log('\n256 PGs, size=3, failure domain=rack');
|
||||
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
|
||||
res = await LPOptimizer.optimize_initial(compat({ osd_tree: flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 }));
|
||||
LPOptimizer.print_change_stats(res, false);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,118 @@
|
|||
const { random_custom_combinations, index_tree, parse_level_indexes, parse_pg_dsl } = require('./dsl_pgs.js');
|
||||
|
||||
function check(result, expected)
|
||||
{
|
||||
console.dir(result, { depth: null });
|
||||
if (JSON.stringify(result) !== JSON.stringify(expected))
|
||||
{
|
||||
process.stderr.write('Unexpected value, expected: ');
|
||||
console.dir(expected, { depth: null });
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
check(
|
||||
parse_pg_dsl("any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5"),
|
||||
[
|
||||
[],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', 1 ] ],
|
||||
[ [ 'dc', '!=', 1 ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', 3 ] ],
|
||||
[ [ 'dc', '!=', [ 1, 3 ] ] ],
|
||||
[ [ 'dc', '=', 5 ], [ 'host', '!=', 5 ] ],
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
parse_pg_dsl("dc=meow, dc!=1, dc>2"),
|
||||
[
|
||||
[ [ 'dc', '=', { id: 'meow' } ] ],
|
||||
[ [ 'dc', '!=', 1 ] ],
|
||||
[ [ 'dc', '>', 2 ] ],
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
parse_level_indexes({ dc: '112233', host: 'ABCDEF' }),
|
||||
[
|
||||
[],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1 ] ], [ 'host', '!=', [ 1, 2 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1, 3 ] ], [ 'host', '!=', [ 1, 2, 3, 4 ] ] ],
|
||||
[ [ 'dc', '=', 5 ], [ 'host', '!=', [ 1, 2, 3, 4, 5 ] ] ],
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
parse_level_indexes({ dc: '112233', host: 'ABCDEF' }, [ 'dc', 'host' ]),
|
||||
[
|
||||
[],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1, 3 ] ] ],
|
||||
[ [ 'dc', '=', 5 ], [ 'host', '!=', [ 5 ] ] ],
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
parse_level_indexes({ dc: '112211223333', host: '123456789ABC' }),
|
||||
[
|
||||
[],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1 ] ], [ 'host', '!=', [ 1, 2 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3 ] ] ],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 3, 4 ] ] ],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 3, 4, 5 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1, 3 ] ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ] ] ],
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
parse_level_indexes({ dc: '112211223333', host: '123456789ABC' }, [ 'dc', 'host' ]),
|
||||
[
|
||||
[],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3 ] ] ],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2 ] ] ],
|
||||
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 5 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3, 4 ] ] ],
|
||||
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3, 4, 7 ] ] ],
|
||||
[ [ 'dc', '!=', [ 1, 3 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9, 10 ] ] ],
|
||||
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9, 10, 11 ] ] ]
|
||||
]
|
||||
);
|
||||
|
||||
check(
|
||||
Object.keys(random_custom_combinations(index_tree([
|
||||
{ id: '1', size: 1, level: 'osd' },
|
||||
{ id: '2', size: 2, level: 'osd' },
|
||||
{ id: '3', size: 3, level: 'osd' }
|
||||
]), parse_level_indexes({ osd: '12' }), 10000)).sort(),
|
||||
[ 'pg_1_2', 'pg_1_3', 'pg_2_3' ]
|
||||
);
|
||||
|
||||
check(
|
||||
Object.keys(random_custom_combinations(index_tree([
|
||||
{ id: 'h1', level: 'host' },
|
||||
{ id: 'h2', level: 'host' },
|
||||
{ id: 'h3', level: 'host' },
|
||||
{ id: '1', size: 1, level: 'osd', parent: 'h1' },
|
||||
{ id: '2', size: 1, level: 'osd', parent: 'h2' },
|
||||
{ id: '3', size: 1, level: 'osd', parent: 'h2' },
|
||||
{ id: '4', size: 1, level: 'osd', parent: 'h3' },
|
||||
{ id: '5', size: 1, level: 'osd', parent: 'h3' },
|
||||
]), parse_level_indexes({ host: '1122', osd: '1234' }), 10000)).sort(),
|
||||
[ 'pg_2_3_4_5' ]
|
||||
);
|
||||
|
||||
console.log('OK');
|
|
@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '1.5.0'
|
||||
VERSION = '1.6.1'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -24,4 +24,4 @@ rm fio
|
|||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-1.5.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.5.0$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-1.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.6.1$(rpm --eval '%dist').tar.gz *
|
||||
|
|
|
@ -10,7 +10,7 @@ RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
|||
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
|
||||
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
|
||||
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
|
||||
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel
|
||||
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libnl3-devel
|
||||
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||
|
@ -36,7 +36,7 @@ ADD . /root/vitastor
|
|||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.5.0.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
Name: vitastor
|
||||
Version: 1.5.0
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.5.0.el7.tar.gz
|
||||
Source0: vitastor-1.6.1.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@ -17,6 +17,7 @@ BuildRequires: libisa-l-devel
|
|||
BuildRequires: gf-complete-devel
|
||||
BuildRequires: libibverbs-devel
|
||||
BuildRequires: cmake3
|
||||
BuildRequires: libnl3-devel
|
||||
Requires: vitastor-osd = %{version}-%{release}
|
||||
Requires: vitastor-mon = %{version}-%{release}
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
|
@ -103,7 +104,7 @@ rm -rf $RPM_BUILD_ROOT
|
|||
%make_install
|
||||
. /opt/rh/rh-nodejs12/enable
|
||||
cd mon
|
||||
npm install
|
||||
npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
|
|
|
@ -11,7 +11,7 @@ RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugi
|
|||
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/$basearch/$avdir/!; s!^baseurl=.*Source/.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/Source/advanced-virtualization/!' /etc/yum.repos.d/CentOS-Advanced-Virtualization.repo
|
||||
RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
|
||||
RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
|
||||
fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake
|
||||
fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake libnl3-devel
|
||||
RUN dnf download --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
|
||||
|
@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.5.0.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
Name: vitastor
|
||||
Version: 1.5.0
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.5.0.el8.tar.gz
|
||||
Source0: vitastor-1.6.1.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@ -16,6 +16,7 @@ BuildRequires: libisa-l-devel
|
|||
BuildRequires: gf-complete-devel
|
||||
BuildRequires: libibverbs-devel
|
||||
BuildRequires: cmake
|
||||
BuildRequires: libnl3-devel
|
||||
Requires: vitastor-osd = %{version}-%{release}
|
||||
Requires: vitastor-mon = %{version}-%{release}
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
|
@ -100,7 +101,7 @@ Vitastor fio drivers for benchmarking.
|
|||
rm -rf $RPM_BUILD_ROOT
|
||||
%make_install
|
||||
cd mon
|
||||
npm install
|
||||
npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
|
|
|
@ -8,7 +8,7 @@ WORKDIR /root
|
|||
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
|
||||
RUN dnf -y install epel-release dnf-plugins-core
|
||||
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
|
||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake
|
||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake libnl3-devel
|
||||
RUN dnf download --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
|
||||
|
@ -18,7 +18,7 @@ ADD . /root/vitastor
|
|||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.5.0.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.6.1.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
Name: vitastor
|
||||
Version: 1.5.0
|
||||
Version: 1.6.1
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.5.0.el9.tar.gz
|
||||
Source0: vitastor-1.6.1.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@ -16,6 +16,7 @@ BuildRequires: libisa-l-devel
|
|||
BuildRequires: gf-complete-devel
|
||||
BuildRequires: rdma-core-devel
|
||||
BuildRequires: cmake
|
||||
BuildRequires: libnl3-devel
|
||||
Requires: vitastor-osd = %{version}-%{release}
|
||||
Requires: vitastor-mon = %{version}-%{release}
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
|
@ -93,7 +94,7 @@ Vitastor fio drivers for benchmarking.
|
|||
rm -rf $RPM_BUILD_ROOT
|
||||
%cmake_install
|
||||
cd mon
|
||||
npm install
|
||||
npm install --production
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
|
|
|
@ -4,6 +4,9 @@ project(vitastor)
|
|||
|
||||
include(GNUInstallDirs)
|
||||
include(CTest)
|
||||
include(CheckIncludeFile)
|
||||
|
||||
find_package(PkgConfig)
|
||||
|
||||
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||
|
@ -16,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="1.5.0")
|
||||
add_definitions(-DVERSION="1.6.1")
|
||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_link_options(-fno-omit-frame-pointer)
|
||||
if (${WITH_ASAN})
|
||||
|
@ -25,15 +28,15 @@ if (${WITH_ASAN})
|
|||
endif (${WITH_ASAN})
|
||||
|
||||
set(CMAKE_BUILD_TYPE RelWithDebInfo)
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
|
||||
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
|
||||
|
@ -43,6 +46,8 @@ macro(install_symlink filepath sympath)
|
|||
install(CODE "message(\"-- Created symlink: ${sympath} -> ${filepath}\")")
|
||||
endmacro(install_symlink)
|
||||
|
||||
check_include_file("linux/nbd-netlink.h" HAVE_NBD_NETLINK_H)
|
||||
|
||||
find_package(PkgConfig)
|
||||
pkg_check_modules(LIBURING REQUIRED liburing)
|
||||
if (${WITH_QEMU})
|
||||
|
@ -178,12 +183,15 @@ if (${WITH_FIO})
|
|||
endif (${WITH_FIO})
|
||||
|
||||
# vitastor-nbd
|
||||
pkg_check_modules(NL3 libnl-3.0 libnl-genl-3.0)
|
||||
add_executable(vitastor-nbd
|
||||
nbd_proxy.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-nbd
|
||||
vitastor_client
|
||||
)
|
||||
target_include_directories(vitastor-nbd PUBLIC ${NL3_INCLUDE_DIRS})
|
||||
target_link_libraries(vitastor-nbd vitastor_client ${NL3_LIBRARIES})
|
||||
if (HAVE_NBD_NETLINK_H AND NL3_LIBRARIES)
|
||||
target_compile_definitions(vitastor-nbd PUBLIC HAVE_NBD_NETLINK_H)
|
||||
endif (HAVE_NBD_NETLINK_H AND NL3_LIBRARIES)
|
||||
|
||||
# libvitastor_kv.so
|
||||
add_library(vitastor_kv SHARED
|
||||
|
|
|
@ -86,6 +86,8 @@ void journal_flusher_t::loop()
|
|||
cur_flusher_count--;
|
||||
}
|
||||
}
|
||||
if (trim_wanted)
|
||||
co[0].try_trim = true;
|
||||
for (int i = 0; (active_flushers > 0 || dequeuing || trim_wanted > 0) && i < cur_flusher_count; i++)
|
||||
co[i].loop();
|
||||
}
|
||||
|
@ -364,10 +366,10 @@ resume_0:
|
|||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
|
||||
if (flusher->trim_wanted > 0 && try_trim)
|
||||
{
|
||||
// Attempt forced trim
|
||||
cur.oid = {};
|
||||
try_trim = false;
|
||||
flusher->active_flushers++;
|
||||
goto trim_journal;
|
||||
}
|
||||
|
@ -375,6 +377,7 @@ stop_flusher:
|
|||
wait_state = 0;
|
||||
return true;
|
||||
}
|
||||
try_trim = true;
|
||||
cur.oid = flusher->flush_queue.front();
|
||||
cur.version = flusher->flush_versions[cur.oid];
|
||||
flusher->flush_queue.pop_front();
|
||||
|
|
|
@ -60,6 +60,7 @@ class journal_flusher_co
|
|||
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
|
||||
|
||||
bool try_trim = false;
|
||||
bool skip_copy, has_delete, has_writes;
|
||||
std::vector<copy_buffer_t> v;
|
||||
std::vector<copy_buffer_t>::iterator it;
|
||||
|
|
|
@ -177,6 +177,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
|
|||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
journal.sector_info[cur_sector].written = true;
|
||||
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
|
||||
assert(journal.submit_id != 0); // check overflow
|
||||
journal.submitting_sectors.push_back(cur_sector);
|
||||
journal.sector_info[cur_sector].flush_count++;
|
||||
data->iov = (struct iovec){
|
||||
|
@ -192,8 +193,8 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
|
|||
}
|
||||
journal.sector_info[cur_sector].dirty = false;
|
||||
// But always remember that this operation has to wait until this exact journal write is finished
|
||||
journal.flushing_ops.insert((pending_journaling_t){
|
||||
.flush_id = journal.sector_info[cur_sector].submit_id,
|
||||
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
|
||||
.pending = 1,
|
||||
.sector = cur_sector,
|
||||
.op = op,
|
||||
});
|
||||
|
@ -213,23 +214,43 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
|
|||
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
|
||||
disk_error_abort("journal write", data->res, data->iov.iov_len);
|
||||
}
|
||||
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
|
||||
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
|
||||
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
|
||||
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
|
||||
{
|
||||
journal.sector_info[fl_it->sector].flush_count--;
|
||||
journal.sector_info[fl_it->second.sector].flush_count--;
|
||||
}
|
||||
while (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
|
||||
auto is_first = fl_it == journal.flushing_ops.begin();
|
||||
while (fl_it != journal.flushing_ops.end())
|
||||
{
|
||||
auto priv = PRIV(fl_it->op);
|
||||
priv->pending_ops--;
|
||||
assert(priv->pending_ops >= 0);
|
||||
if (priv->pending_ops == 0)
|
||||
bool del = false;
|
||||
if (fl_it->first == flush_id)
|
||||
{
|
||||
release_journal_sectors(fl_it->op);
|
||||
priv->op_state++;
|
||||
ringloop->wakeup();
|
||||
fl_it->second.pending = 0;
|
||||
del = is_first;
|
||||
}
|
||||
else
|
||||
{
|
||||
del = !fl_it->second.pending;
|
||||
}
|
||||
if (del)
|
||||
{
|
||||
// Do not complete this operation if previous writes are unfinished
|
||||
// Otherwise also complete following operations waiting for this one
|
||||
auto priv = PRIV(fl_it->second.op);
|
||||
priv->pending_ops--;
|
||||
assert(priv->pending_ops >= 0);
|
||||
if (priv->pending_ops == 0)
|
||||
{
|
||||
release_journal_sectors(fl_it->second.op);
|
||||
priv->op_state++;
|
||||
ringloop->wakeup();
|
||||
}
|
||||
journal.flushing_ops.erase(fl_it++);
|
||||
}
|
||||
else
|
||||
{
|
||||
fl_it++;
|
||||
}
|
||||
journal.flushing_ops.erase(fl_it++);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -156,16 +156,11 @@ struct journal_sector_info_t
|
|||
|
||||
struct pending_journaling_t
|
||||
{
|
||||
uint64_t flush_id;
|
||||
int pending;
|
||||
int sector;
|
||||
blockstore_op_t *op;
|
||||
};
|
||||
|
||||
inline bool operator < (const pending_journaling_t & a, const pending_journaling_t & b)
|
||||
{
|
||||
return a.flush_id < b.flush_id || a.flush_id == b.flush_id && a.op < b.op;
|
||||
}
|
||||
|
||||
struct journal_t
|
||||
{
|
||||
int fd;
|
||||
|
@ -191,7 +186,7 @@ struct journal_t
|
|||
int cur_sector = 0;
|
||||
int in_sector_pos = 0;
|
||||
std::vector<int> submitting_sectors;
|
||||
std::set<pending_journaling_t> flushing_ops;
|
||||
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
|
||||
uint64_t submit_id = 0;
|
||||
|
||||
// Used sector map
|
||||
|
|
|
@ -427,7 +427,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
);
|
||||
write_iodepth++;
|
||||
// Got SQEs. Prepare previous journal sector write if required
|
||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
if (immediate_commit == IMMEDIATE_NONE &&
|
||||
!journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
|
||||
{
|
||||
|
@ -503,7 +502,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
}
|
||||
BS_SUBMIT_GET_SQE(sqe2, data2);
|
||||
data2->iov = (struct iovec){ op->buf, op->len };
|
||||
data2->callback = cb;
|
||||
++journal.submit_id;
|
||||
assert(journal.submit_id != 0); // check overflow
|
||||
// Make subsequent journal writes wait for our data write
|
||||
journal.flushing_ops.emplace(journal.submit_id, (pending_journaling_t){
|
||||
.pending = 1,
|
||||
.sector = -1,
|
||||
.op = op,
|
||||
});
|
||||
data2->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
|
||||
my_uring_prep_writev(
|
||||
sqe2, dsk.journal_fd, &data2->iov, 1, journal.offset + journal.next_free
|
||||
);
|
||||
|
|
|
@ -76,8 +76,8 @@ static const char* help_text =
|
|||
" <to> must be a child of <from> and <target> may be one of the layers between\n"
|
||||
" <from> and <to>, including <from> and <to>.\n"
|
||||
"\n"
|
||||
"vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
|
||||
" Describe unclean object locations in the cluster.\n"
|
||||
"vitastor-cli describe [OPTIONS]\n"
|
||||
" Describe unclean object locations in the cluster. Options:\n"
|
||||
" --osds <osds>\n"
|
||||
" Only list objects from primary OSD(s) <osds>.\n"
|
||||
" --object-state <states>\n"
|
||||
|
@ -85,6 +85,8 @@ static const char* help_text =
|
|||
" degraded, misplaced, incomplete, corrupted, inconsistent.\n"
|
||||
" --pool <pool name or number>\n"
|
||||
" Only list objects in the given pool.\n"
|
||||
" --pg <pg number>\n"
|
||||
" Only list objects in the given PG of the pool.\n"
|
||||
" --inode, --min-inode, --max-inode\n"
|
||||
" Restrict listing to specific inode numbers.\n"
|
||||
" --min-offset, --max-offset\n"
|
||||
|
@ -129,6 +131,8 @@ static const char* help_text =
|
|||
" --block_size 128k Put pool only on OSDs with this data block size\n"
|
||||
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
|
||||
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
|
||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||
|
@ -145,6 +149,7 @@ static const char* help_text =
|
|||
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
|
||||
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
||||
" [--level_placement <rules>] [--raw_placement <rules>]\n"
|
||||
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
||||
" [--block_size <size>] [--bitmap_granularity <size>]\n"
|
||||
" [--immediate_commit <all|small|none>] [--pg_stripe_size <size>]\n"
|
||||
|
|
|
@ -37,6 +37,7 @@ struct cli_describe_t
|
|||
{
|
||||
uint64_t object_state = 0;
|
||||
pool_id_t only_pool = 0;
|
||||
pg_num_t only_pg = 0;
|
||||
std::vector<uint64_t> only_osds;
|
||||
uint64_t min_inode = 0, max_inode = 0;
|
||||
uint64_t min_offset = 0, max_offset = 0;
|
||||
|
@ -68,6 +69,7 @@ struct cli_describe_t
|
|||
}
|
||||
}
|
||||
}
|
||||
only_pg = cfg["pg"].uint64_value();
|
||||
min_inode = cfg["inode"].uint64_value();
|
||||
if (min_inode)
|
||||
{
|
||||
|
@ -142,8 +144,8 @@ struct cli_describe_t
|
|||
{
|
||||
osd_op_t *op = new osd_op_t;
|
||||
op->req = (osd_any_op_t){
|
||||
.describe = {
|
||||
.header = {
|
||||
.describe = (osd_op_describe_t){
|
||||
.header = (osd_op_header_t){
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DESCRIBE,
|
||||
|
@ -153,6 +155,8 @@ struct cli_describe_t
|
|||
.min_offset = min_offset,
|
||||
.max_inode = max_inode,
|
||||
.max_offset = max_offset,
|
||||
.pool_id = only_pool,
|
||||
.pg_num = only_pg,
|
||||
},
|
||||
};
|
||||
op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
|
||||
|
@ -182,7 +186,7 @@ struct cli_describe_t
|
|||
printf(
|
||||
(parent->json_output
|
||||
? (count > 0 ? ",\n " FMT : " " FMT)
|
||||
: "%jx:%jx part %u on OSD %ju%s%s%s\n"),
|
||||
: "0x%jx:0x%jx part %u on OSD %ju%s%s%s\n"),
|
||||
#undef FMT
|
||||
items[i].inode, items[i].stripe,
|
||||
items[i].role, items[i].osd_num,
|
||||
|
|
|
@ -71,7 +71,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
auto & key = kv_it->first;
|
||||
auto & value = kv_it->second;
|
||||
if (key == "pg_size" || key == "parity_chunks" || key == "pg_minsize" ||
|
||||
key == "pg_count" || key == "max_osd_combinations" || key == "block_size" ||
|
||||
key == "pg_count" || key == "max_osd_combinations" ||
|
||||
key == "bitmap_granularity" || key == "pg_stripe_size")
|
||||
{
|
||||
if (value.is_number() && value.uint64_value() != value.number_value() ||
|
||||
|
@ -81,10 +81,47 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
}
|
||||
value = value.uint64_value();
|
||||
}
|
||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs")
|
||||
else if (key == "block_size")
|
||||
{
|
||||
// OK
|
||||
value = value.is_string() ? parse_size(value.string_value()) : value.uint64_value();
|
||||
if (!value)
|
||||
{
|
||||
return key+" must be an integer with or without size suffix (K/M/G/T)";
|
||||
}
|
||||
}
|
||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
|
||||
key == "raw_placement")
|
||||
{
|
||||
if (!value.is_string())
|
||||
{
|
||||
return key+" must be a string";
|
||||
}
|
||||
}
|
||||
else if (key == "level_placement")
|
||||
{
|
||||
// level=rule, level=rule, ...
|
||||
if (!value.is_object())
|
||||
{
|
||||
json11::Json::object obj;
|
||||
for (auto & item: explode(",", value.string_value(), true))
|
||||
{
|
||||
auto pair = explode("=", item, true);
|
||||
if (pair.size() >= 2)
|
||||
{
|
||||
obj[pair[0]] = pair[1];
|
||||
}
|
||||
}
|
||||
if (obj.size())
|
||||
{
|
||||
value = obj;
|
||||
}
|
||||
else
|
||||
{
|
||||
new_cfg.erase(kv_it++);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (key == "osd_tags" || key == "primary_affinity_tags")
|
||||
{
|
||||
|
@ -184,6 +221,38 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
return "PG size can't be greater than 256";
|
||||
}
|
||||
|
||||
// PG rules
|
||||
if (!cfg["level_placement"].is_null())
|
||||
{
|
||||
for (auto & lr: cfg["level_placement"].object_items())
|
||||
{
|
||||
int len = 0;
|
||||
if (lr.second.is_array())
|
||||
{
|
||||
for (auto & lri: lr.second.array_items())
|
||||
{
|
||||
if (!lri.is_string() && !lri.is_number())
|
||||
{
|
||||
return "--level_placement contains an array with non-scalar value: "+lri.dump();
|
||||
}
|
||||
}
|
||||
len = lr.second.array_items().size();
|
||||
}
|
||||
else if (!lr.second.is_string())
|
||||
{
|
||||
return "--level_placement contains a non-array and non-string value: "+lr.second.dump();
|
||||
}
|
||||
else
|
||||
{
|
||||
len = lr.second.string_value().size();
|
||||
}
|
||||
if (len != pg_size)
|
||||
{
|
||||
return "values in --level_placement should be exactly pg_size ("+std::to_string(pg_size)+") long";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parity_chunks
|
||||
uint64_t parity_chunks = 1;
|
||||
if (scheme == POOL_SCHEME_EC)
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
struct pool_creator_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
json11::Json::object cfg;
|
||||
json11::Json cfg;
|
||||
|
||||
bool force = false;
|
||||
bool wait = false;
|
||||
|
@ -55,8 +55,12 @@ struct pool_creator_t
|
|||
goto resume_8;
|
||||
|
||||
// Validate pool parameters
|
||||
result.text = validate_pool_config(cfg, json11::Json(), parent->cli->st_cli.global_block_size,
|
||||
parent->cli->st_cli.global_bitmap_granularity, force);
|
||||
{
|
||||
auto new_cfg = cfg.object_items();
|
||||
result.text = validate_pool_config(new_cfg, json11::Json(), parent->cli->st_cli.global_block_size,
|
||||
parent->cli->st_cli.global_bitmap_granularity, force);
|
||||
cfg = new_cfg;
|
||||
}
|
||||
if (result.text != "")
|
||||
{
|
||||
result.err = EINVAL;
|
||||
|
@ -605,7 +609,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_pool_create(json11::Json c
|
|||
{
|
||||
auto pool_creator = new pool_creator_t();
|
||||
pool_creator->parent = this;
|
||||
pool_creator->cfg = cfg.object_items();
|
||||
pool_creator->cfg = cfg;
|
||||
pool_creator->force = cfg["force"].bool_value();
|
||||
pool_creator->wait = cfg["wait"].bool_value();
|
||||
return [pool_creator](cli_result_t & result)
|
||||
|
|
|
@ -205,7 +205,7 @@ resume_1:
|
|||
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
|
||||
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
|
||||
{ "pg_real_size", pool_stats[pool_cfg.id]["pg_real_size"].uint64_value() },
|
||||
{ "osd_count", pg_per_osd.size() },
|
||||
{ "osd_count", (uint64_t)pg_per_osd.size() },
|
||||
};
|
||||
}
|
||||
// Include full pool config
|
||||
|
|
|
@ -110,6 +110,12 @@ resume_2:
|
|||
}
|
||||
}
|
||||
int mon_count = 0;
|
||||
int osds_full = 0, osds_nearfull = 0;
|
||||
double osd_nearfull_ratio = parent->cli->config["osd_nearfull_ratio"].number_value();
|
||||
if (!osd_nearfull_ratio)
|
||||
{
|
||||
osd_nearfull_ratio = 0.95;
|
||||
}
|
||||
std::string mon_master;
|
||||
for (int i = 0; i < mon_members.size(); i++)
|
||||
{
|
||||
|
@ -139,8 +145,18 @@ resume_2:
|
|||
continue;
|
||||
}
|
||||
osd_count++;
|
||||
total_raw += kv.value["size"].uint64_value();
|
||||
free_raw += kv.value["free"].uint64_value();
|
||||
auto osd_size = kv.value["size"].uint64_value();
|
||||
auto osd_free = kv.value["free"].uint64_value();
|
||||
total_raw += osd_size;
|
||||
free_raw += osd_free;
|
||||
if (!osd_free)
|
||||
{
|
||||
osds_full++;
|
||||
}
|
||||
else if (osd_free < (uint64_t)(osd_size*(1-osd_nearfull_ratio)))
|
||||
{
|
||||
osds_nearfull++;
|
||||
}
|
||||
auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
|
||||
if (peer_it != parent->cli->st_cli.peer_states.end())
|
||||
{
|
||||
|
@ -281,11 +297,27 @@ resume_2:
|
|||
else if (no_scrub)
|
||||
recovery_io += " scrub: "+str_repeat(" ", io_indent+1)+"disabled\n";
|
||||
}
|
||||
std::string warning_str;
|
||||
if (osds_full)
|
||||
{
|
||||
warning_str += " "+std::to_string(osds_full)+
|
||||
(osds_full > 1 ? " osds are full\n" : " osd is full\n");
|
||||
}
|
||||
if (osds_nearfull)
|
||||
{
|
||||
warning_str += " "+std::to_string(osds_nearfull)+
|
||||
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
|
||||
}
|
||||
if (warning_str != "")
|
||||
{
|
||||
warning_str = "\n warning:\n"+warning_str;
|
||||
}
|
||||
printf(
|
||||
" cluster:\n"
|
||||
" etcd: %d / %zd up, %s database size\n"
|
||||
" mon: %d up%s\n"
|
||||
" osd: %d / %d up\n"
|
||||
"%s"
|
||||
" \n"
|
||||
" data:\n"
|
||||
" raw: %s used, %s / %s available%s\n"
|
||||
|
@ -298,7 +330,7 @@ resume_2:
|
|||
"%s",
|
||||
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
|
||||
mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
|
||||
osd_up, osd_count,
|
||||
osd_up, osd_count, warning_str.c_str(),
|
||||
format_size(total_raw-free_raw).c_str(),
|
||||
format_size(free_raw-free_down_raw).c_str(),
|
||||
format_size(total_raw-down_raw).c_str(),
|
||||
|
|
|
@ -25,7 +25,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
// peer_osd just connected
|
||||
continue_ops();
|
||||
// retry operations waiting for connection immediately
|
||||
continue_ops(client_retry_interval);
|
||||
continue_lists();
|
||||
continue_raw_ops(peer_osd);
|
||||
}
|
||||
|
@ -397,6 +398,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
|||
client_eio_retry_interval = 10;
|
||||
}
|
||||
}
|
||||
// client_retry_enospc
|
||||
client_retry_enospc = config["client_retry_enospc"].is_null() ? true : config["client_retry_enospc"].bool_value();
|
||||
// log_level
|
||||
log_level = config["log_level"].uint64_value();
|
||||
msgr.parse_config(config);
|
||||
|
@ -817,7 +820,7 @@ resume_2:
|
|||
return 1;
|
||||
}
|
||||
else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
|
||||
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC)
|
||||
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && (op->retval != -ENOSPC || !client_retry_enospc))
|
||||
{
|
||||
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
|
||||
erase_op(op);
|
||||
|
@ -1209,7 +1212,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
// Set op->retry_after to retry operation after a short pause (not immediately)
|
||||
if (!op->retry_after)
|
||||
{
|
||||
op->retry_after = op->retval == -EIO ? client_eio_retry_interval : client_retry_interval;
|
||||
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
|
||||
}
|
||||
reset_retry_timer(op->retry_after);
|
||||
if (stop_fd >= 0)
|
||||
|
@ -1217,7 +1220,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
msgr.stop_client(stop_fd);
|
||||
}
|
||||
op->inflight_count--;
|
||||
if (op->inflight_count == 0)
|
||||
if (op->inflight_count == 0 && !op->retry_after)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
|
@ -1242,7 +1245,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
{
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
if (op->inflight_count == 0 && !op->retry_after)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
|
|
|
@ -94,6 +94,7 @@ class cluster_client_t
|
|||
int log_level = 0;
|
||||
int client_retry_interval = 50; // ms
|
||||
int client_eio_retry_interval = 1000; // ms
|
||||
bool client_retry_enospc = true;
|
||||
|
||||
int retry_timeout_id = 0;
|
||||
int retry_timeout_duration = 0;
|
||||
|
|
|
@ -60,7 +60,7 @@ static const char *help_text =
|
|||
" --block_size 128k/1M Set blockstore object size\n"
|
||||
" --bitmap_granularity 4k Set bitmap granularity\n"
|
||||
" --data_csum_type none Set data checksum type (crc32c or none)\n"
|
||||
" --csum_block_size 4k Set data checksum block size\n"
|
||||
" --csum_block_size 4k/32k Set data checksum block size (SSD/HDD default)\n"
|
||||
" --data_device_block 4k Override data device block size\n"
|
||||
" --meta_device_block 4k Override metadata device block size\n"
|
||||
" --journal_device_block 4k Override journal device block size\n"
|
||||
|
|
|
@ -111,6 +111,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
|||
options["block_size"] = "1M";
|
||||
if (is_hybrid && options["throttle_small_writes"] == "")
|
||||
options["throttle_small_writes"] = "1";
|
||||
if (!is_hybrid && options.find("data_csum_type") != options.end() && options.at("data_csum_type") != "")
|
||||
options["csum_block_size"] = "32k";
|
||||
}
|
||||
else if (!json_is_true(options["disable_data_fsync"]))
|
||||
{
|
||||
|
|
|
@ -452,7 +452,7 @@ void kv_cli_t::handle_cmd(const std::vector<std::string> & cmd, std::function<vo
|
|||
auto name = cmd.size() > 1 ? cmd[1] : "";
|
||||
uint64_t pool_id = 0;
|
||||
inode_t inode_id = 0;
|
||||
int scanned = sscanf(name.c_str(), "%lu %lu", &pool_id, &inode_id);
|
||||
int scanned = sscanf(name.c_str(), "%ju %ju", &pool_id, &inode_id);
|
||||
if (scanned < 2 || !pool_id || !inode_id)
|
||||
{
|
||||
inode_id = 0;
|
||||
|
@ -483,7 +483,7 @@ void kv_cli_t::handle_cmd(const std::vector<std::string> & cmd, std::function<vo
|
|||
else
|
||||
{
|
||||
opened = true;
|
||||
fprintf(interactive ? stdout : stderr, "Index opened. Current size: %lu bytes\n", db->get_size());
|
||||
fprintf(interactive ? stdout : stderr, "Index opened. Current size: %ju bytes\n", db->get_size());
|
||||
}
|
||||
cb(res);
|
||||
});
|
||||
|
|
|
@ -250,14 +250,14 @@ int kv_block_t::parse(uint64_t offset, uint8_t *data, int size)
|
|||
{
|
||||
// empty block
|
||||
if (offset != 0)
|
||||
fprintf(stderr, "K/V: Block %lu is %s\n", offset, blk->magic == 0 ? "empty" : "cleared");
|
||||
fprintf(stderr, "K/V: Block %ju is %s\n", offset, blk->magic == 0 ? "empty" : "cleared");
|
||||
return -ENOTBLK;
|
||||
}
|
||||
if (blk->magic != KV_BLOCK_MAGIC || blk->block_size != size ||
|
||||
!blk->type || blk->type > KV_EMPTY || blk->items > KV_BLOCK_MAX_ITEMS)
|
||||
{
|
||||
// invalid block
|
||||
fprintf(stderr, "K/V: Invalid block %lu magic, size, type or item count\n", offset);
|
||||
fprintf(stderr, "K/V: Invalid block %ju magic, size, type or item count\n", offset);
|
||||
return -EILSEQ;
|
||||
}
|
||||
assert(!this->type);
|
||||
|
@ -266,13 +266,13 @@ int kv_block_t::parse(uint64_t offset, uint8_t *data, int size)
|
|||
this->key_ge = read_string(data, size, &pos);
|
||||
if (pos < 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu left bound\n", offset);
|
||||
fprintf(stderr, "K/V: Invalid block %ju left bound\n", offset);
|
||||
return -EILSEQ;
|
||||
}
|
||||
this->key_lt = read_string(data, size, &pos);
|
||||
if (pos < 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu right bound\n", offset);
|
||||
fprintf(stderr, "K/V: Invalid block %ju right bound\n", offset);
|
||||
return -EILSEQ;
|
||||
}
|
||||
if (this->type == KV_INT_SPLIT || this->type == KV_LEAF_SPLIT)
|
||||
|
@ -280,12 +280,12 @@ int kv_block_t::parse(uint64_t offset, uint8_t *data, int size)
|
|||
this->right_half = read_string(data, size, &pos);
|
||||
if (pos < 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu split bound\n", offset);
|
||||
fprintf(stderr, "K/V: Invalid block %ju split bound\n", offset);
|
||||
return -EILSEQ;
|
||||
}
|
||||
if (pos+8 > size)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu split block ref\n", offset);
|
||||
fprintf(stderr, "K/V: Invalid block %ju split block ref\n", offset);
|
||||
return -EILSEQ;
|
||||
}
|
||||
this->right_half_block = *(uint64_t*)(data+pos);
|
||||
|
@ -296,13 +296,13 @@ int kv_block_t::parse(uint64_t offset, uint8_t *data, int size)
|
|||
auto key = read_string(data, size, &pos);
|
||||
if (pos < 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu key %d\n", offset, i);
|
||||
fprintf(stderr, "K/V: Invalid block %ju key %d\n", offset, i);
|
||||
return -EILSEQ;
|
||||
}
|
||||
auto value = read_string(data, size, &pos);
|
||||
if (pos < 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Invalid block %lu value %d\n", offset, i);
|
||||
fprintf(stderr, "K/V: Invalid block %ju value %d\n", offset, i);
|
||||
return -EILSEQ;
|
||||
}
|
||||
this->data[key] = value;
|
||||
|
@ -470,7 +470,7 @@ static void dump_str(const std::string & str)
|
|||
void kv_block_t::dump(int base_level)
|
||||
{
|
||||
printf(
|
||||
"{\n \"block\": %lu,\n \"level\": %d,\n \"type\": \"%s\",\n \"range\": [",
|
||||
"{\n \"block\": %ju,\n \"level\": %d,\n \"type\": \"%s\",\n \"range\": [",
|
||||
offset, base_level+level,
|
||||
type < sizeof(block_type_names)/sizeof(block_type_names[0]) ? block_type_names[type] : "unknown"
|
||||
);
|
||||
|
@ -482,7 +482,7 @@ void kv_block_t::dump(int base_level)
|
|||
{
|
||||
printf(" \"right_half\": { ");
|
||||
dump_str(right_half);
|
||||
printf(": %lu },\n", right_half_block);
|
||||
printf(": %ju },\n", right_half_block);
|
||||
}
|
||||
printf(" \"data\": {\n");
|
||||
for (auto & kv: data)
|
||||
|
@ -493,7 +493,7 @@ void kv_block_t::dump(int base_level)
|
|||
if (type == KV_LEAF || type == KV_LEAF_SPLIT || kv.second.size() != 8)
|
||||
dump_str(kv.second);
|
||||
else
|
||||
printf("%lu", *(uint64_t*)kv.second.c_str());
|
||||
printf("%ju", *(uint64_t*)kv.second.c_str());
|
||||
printf(",\n");
|
||||
}
|
||||
printf(" }\n}\n");
|
||||
|
@ -1037,7 +1037,7 @@ void kv_op_t::get()
|
|||
{
|
||||
if (cur_block != 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: Hit empty block %lu while searching\n", cur_block);
|
||||
fprintf(stderr, "K/V: Hit empty block %ju while searching\n", cur_block);
|
||||
finish(-EILSEQ);
|
||||
}
|
||||
else
|
||||
|
@ -1095,7 +1095,7 @@ int kv_op_t::handle_block(int res, int refresh, bool stop_on_split)
|
|||
bool fatal = !this->updating_on_path && this->retry > 0;
|
||||
if (fatal || db->log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: %sgot unrelated block %lu: key=%s range=[%s, %s) from=[%s, %s)\n",
|
||||
fprintf(stderr, "K/V: %sgot unrelated block %ju: key=%s range=[%s, %s) from=[%s, %s)\n",
|
||||
fatal ? "Error: " : "Warning: read/update collision: ",
|
||||
cur_block, key.c_str(), blk->key_ge.c_str(), blk->key_lt.c_str(), prev_key_ge.c_str(), prev_key_lt.c_str());
|
||||
if (fatal)
|
||||
|
@ -1160,7 +1160,7 @@ int kv_op_t::handle_block(int res, int refresh, bool stop_on_split)
|
|||
auto child_it = blk->data.upper_bound(key);
|
||||
if (child_it == blk->data.begin())
|
||||
{
|
||||
fprintf(stderr, "K/V: Internal block %lu misses boundary for %s\n", cur_block, key.c_str());
|
||||
fprintf(stderr, "K/V: Internal block %ju misses boundary for %s\n", cur_block, key.c_str());
|
||||
return -EILSEQ;
|
||||
}
|
||||
auto m = child_it == blk->data.end()
|
||||
|
@ -1169,7 +1169,7 @@ int kv_op_t::handle_block(int res, int refresh, bool stop_on_split)
|
|||
child_it--;
|
||||
if (child_it->second.size() != sizeof(uint64_t))
|
||||
{
|
||||
fprintf(stderr, "K/V: Internal block %lu reference is not 8 byte long\n", cur_block);
|
||||
fprintf(stderr, "K/V: Internal block %ju reference is not 8 byte long\n", cur_block);
|
||||
blk->dump(db->base_block_level);
|
||||
return -EILSEQ;
|
||||
}
|
||||
|
@ -1246,7 +1246,7 @@ static void write_block(kv_db_t *db, kv_block_t *blk, std::function<void(int)> c
|
|||
blk->dump(db->base_block_level);
|
||||
uint64_t old_size = blk->data_size;
|
||||
blk->set_data_size();
|
||||
fprintf(stderr, "K/V: block %lu (ptr=%lx) grew too large: tracked %lu, but real is %u bytes\n",
|
||||
fprintf(stderr, "K/V: block %ju (ptr=%jx) grew too large: tracked %ju, but real is %u bytes\n",
|
||||
blk->offset, (uint64_t)blk, old_size, blk->data_size);
|
||||
abort();
|
||||
return;
|
||||
|
@ -1479,7 +1479,7 @@ void kv_op_t::update_find()
|
|||
else if (cur_block == 0)
|
||||
finish(-ENOENT);
|
||||
else
|
||||
fprintf(stderr, "K/V: Hit empty block %lu while searching\n", cur_block);
|
||||
fprintf(stderr, "K/V: Hit empty block %ju while searching\n", cur_block);
|
||||
}
|
||||
else if (res == -ECHILD)
|
||||
{
|
||||
|
@ -1506,7 +1506,7 @@ void kv_op_t::create_root()
|
|||
// if a referenced non-root block is empty, we just return an error.
|
||||
if (cur_block != 0 || db->next_free != 0)
|
||||
{
|
||||
fprintf(stderr, "K/V: create_root called with non-empty DB (cur_block=%lu)\n", cur_block);
|
||||
fprintf(stderr, "K/V: create_root called with non-empty DB (cur_block=%ju)\n", cur_block);
|
||||
finish(-EILSEQ);
|
||||
return;
|
||||
}
|
||||
|
@ -1551,7 +1551,7 @@ void kv_op_t::resume_split()
|
|||
if (path.size() == 1)
|
||||
{
|
||||
// It shouldn't be the root block because we don't split it via INT_SPLIT/LEAF_SPLIT
|
||||
fprintf(stderr, "K/V: resume_split at root item (cur_block=%lu)\n", cur_block);
|
||||
fprintf(stderr, "K/V: resume_split at root item (cur_block=%ju)\n", cur_block);
|
||||
finish(-EILSEQ);
|
||||
return;
|
||||
}
|
||||
|
@ -1638,7 +1638,7 @@ void kv_op_t::update_block(int path_pos, bool is_delete, const std::string & key
|
|||
{
|
||||
blk->dump(0);
|
||||
// Should not happen - we should have resumed the split
|
||||
fprintf(stderr, "K/V: attempt to write into block %lu instead of resuming the split (got here from %s..%s)\n",
|
||||
fprintf(stderr, "K/V: attempt to write into block %ju instead of resuming the split (got here from %s..%s)\n",
|
||||
blk->offset, prev_key_ge.c_str(), prev_key_lt.c_str());
|
||||
abort();
|
||||
}
|
||||
|
@ -1650,7 +1650,7 @@ void kv_op_t::update_block(int path_pos, bool is_delete, const std::string & key
|
|||
// No need to split the block => just modify and write it
|
||||
if ((blk->type == KV_LEAF_SPLIT || blk->type == KV_INT_SPLIT) && key >= blk->right_half)
|
||||
{
|
||||
fprintf(stderr, "K/V: attempt to modify %s in unrelated split block %lu [%s..%s..%s)\n",
|
||||
fprintf(stderr, "K/V: attempt to modify %s in unrelated split block %ju [%s..%s..%s)\n",
|
||||
key.c_str(), blk->offset, blk->key_ge.c_str(), blk->right_half.c_str(), blk->key_lt.c_str());
|
||||
blk->dump(db->base_block_level);
|
||||
abort();
|
||||
|
@ -1753,11 +1753,11 @@ void kv_op_t::update_block(int path_pos, bool is_delete, const std::string & key
|
|||
clear_block(db, left_blk, 0, [=, left_offset = left_blk->offset](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Failed to clear unreferenced block %lu: %s (code %d)\n", left_offset, strerror(-res), res);
|
||||
fprintf(stderr, "Failed to clear unreferenced block %ju: %s (code %d)\n", left_offset, strerror(-res), res);
|
||||
clear_block(db, right_blk, 0, [=, right_offset = right_blk->offset](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Failed to clear unreferenced block %lu: %s (code %d)\n", right_offset, strerror(-res), res);
|
||||
fprintf(stderr, "Failed to clear unreferenced block %ju: %s (code %d)\n", right_offset, strerror(-res), res);
|
||||
// CAS failure - zero garbage left_blk and right_blk and retry from the beginning
|
||||
if (write_res == -EINTR)
|
||||
update();
|
||||
|
@ -1784,7 +1784,7 @@ void kv_op_t::update_block(int path_pos, bool is_delete, const std::string & key
|
|||
if (path_pos == 0)
|
||||
{
|
||||
// Block number zero should always be the root block
|
||||
fprintf(stderr, "K/V: root block is not 0, but %lu\n", cur_block);
|
||||
fprintf(stderr, "K/V: root block is not 0, but %ju\n", cur_block);
|
||||
cb(-EILSEQ);
|
||||
return;
|
||||
}
|
||||
|
@ -1809,7 +1809,7 @@ void kv_op_t::update_block(int path_pos, bool is_delete, const std::string & key
|
|||
clear_block(db, right_blk, 0, [=, right_offset = right_blk->offset](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Failed to clear unreferenced block %lu: %s (code %d)\n", right_offset, strerror(-res), res);
|
||||
fprintf(stderr, "Failed to clear unreferenced block %ju: %s (code %d)\n", right_offset, strerror(-res), res);
|
||||
// CAS failure - zero garbage right_blk and retry from the beginning
|
||||
if (write_res == -EINTR)
|
||||
update();
|
||||
|
@ -1860,7 +1860,7 @@ void kv_op_t::next_handle_block(int res, int refresh)
|
|||
finish(-ENOENT);
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "K/V: Hit empty block %lu while searching\n", cur_block);
|
||||
fprintf(stderr, "K/V: Hit empty block %ju while searching\n", cur_block);
|
||||
finish(-EILSEQ);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -632,7 +632,7 @@ void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_tim
|
|||
char buf[128] = { 0 };
|
||||
for (int i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
|
||||
{
|
||||
snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", (lats[i]->count-prev[i]->count)*1000000.0/usec,
|
||||
snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%ju us)", (lats[i]->count-prev[i]->count)*1000000.0/usec,
|
||||
lats[i]->name, (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count > 0 ? lats[i]->count-prev[i]->count : 1));
|
||||
int k;
|
||||
for (k = strlen(buf); k < strlen(lats[i]->name)+21; k++)
|
||||
|
@ -652,7 +652,7 @@ void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_tim
|
|||
if (lats[i]->count > prev[i]->count)
|
||||
{
|
||||
printf(
|
||||
",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
|
||||
",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%ju},\"total\":{\"count\":%ju,\"usec\":%ju}}",
|
||||
lats[i]->name, (lats[i]->count-prev[i]->count)*1000000.0/usec,
|
||||
(lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count),
|
||||
lats[i]->count, lats[i]->usec
|
||||
|
|
|
@ -2,21 +2,249 @@
|
|||
// License: VNPL-1.1 (see README.md for details)
|
||||
// Similar to qemu-nbd, but sets timeout and uses io_uring
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <linux/genetlink.h>
|
||||
#include <linux/nbd.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
#include <sys/socket.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <fcntl.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/un.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <signal.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "cluster_client.h"
|
||||
#include "epoll_manager.h"
|
||||
#include "str_util.h"
|
||||
|
||||
#ifdef HAVE_NBD_NETLINK_H
|
||||
#include <netlink/attr.h>
|
||||
#include <netlink/genl/ctrl.h>
|
||||
#include <netlink/genl/genl.h>
|
||||
#include <netlink/handlers.h>
|
||||
#include <netlink/msg.h>
|
||||
#include <netlink/netlink.h>
|
||||
#include <netlink/socket.h>
|
||||
#include <netlink/errno.h>
|
||||
#include <linux/nbd-netlink.h>
|
||||
|
||||
#define fail(...) { fprintf(stderr, __VA_ARGS__); exit(1); }
|
||||
|
||||
struct netlink_ctx
|
||||
{
|
||||
struct nl_sock *sk;
|
||||
int driver_id;
|
||||
};
|
||||
|
||||
static void netlink_sock_alloc(struct netlink_ctx *ctx)
|
||||
{
|
||||
struct nl_sock *sk;
|
||||
int nl_driver_id;
|
||||
|
||||
sk = nl_socket_alloc();
|
||||
if (!sk)
|
||||
{
|
||||
fail("Failed to alloc netlink socket\n");
|
||||
}
|
||||
|
||||
if (genl_connect(sk))
|
||||
{
|
||||
nl_socket_free(sk);
|
||||
fail("Couldn't connect to the generic netlink socket\n");
|
||||
}
|
||||
|
||||
nl_driver_id = genl_ctrl_resolve(sk, "nbd");
|
||||
if (nl_driver_id < 0)
|
||||
{
|
||||
nl_socket_free(sk);
|
||||
fail("Couldn't resolve the nbd netlink family\n");
|
||||
}
|
||||
|
||||
ctx->driver_id = nl_driver_id;
|
||||
ctx->sk = sk;
|
||||
}
|
||||
|
||||
static void netlink_sock_free(struct netlink_ctx *ctx)
|
||||
{
|
||||
free(ctx->sk);
|
||||
ctx->sk = NULL;
|
||||
}
|
||||
|
||||
static int netlink_status_cb(struct nl_msg *sk_msg, void *devnum)
|
||||
{
|
||||
struct nlmsghdr *nl_hdr;
|
||||
struct genlmsghdr *gnl_hdr;
|
||||
struct nlattr *msg_attr[NBD_ATTR_MAX + 1];
|
||||
struct nlattr *attr_data;
|
||||
int attr_len;
|
||||
uint32_t* dev_num;
|
||||
|
||||
dev_num = (uint32_t*)devnum;
|
||||
|
||||
nl_hdr = nlmsg_hdr(sk_msg);
|
||||
gnl_hdr = (struct genlmsghdr *)nlmsg_data(nl_hdr);
|
||||
attr_data = genlmsg_attrdata(gnl_hdr, 0);
|
||||
attr_len = genlmsg_attrlen(gnl_hdr, 0);
|
||||
|
||||
if (nla_parse(msg_attr, NBD_ATTR_MAX, attr_data, attr_len, NULL))
|
||||
{
|
||||
fail("Failed to parse netlink response\n");
|
||||
}
|
||||
|
||||
if (!msg_attr[NBD_ATTR_INDEX])
|
||||
{
|
||||
fail("Got malformed netlink reponse\n");
|
||||
}
|
||||
|
||||
*dev_num = nla_get_u32(msg_attr[NBD_ATTR_INDEX]);
|
||||
|
||||
return NL_OK;
|
||||
}
|
||||
|
||||
static int netlink_configure(const int *sockfd, int sock_size, int dev_num, uint64_t size,
|
||||
uint64_t blocksize, uint64_t flags, uint64_t cflags, uint64_t timeout, uint64_t conn_timeout,
|
||||
const char *backend, bool reconfigure)
|
||||
{
|
||||
struct netlink_ctx ctx;
|
||||
struct nlattr *msg_attr, *msg_opt_attr;
|
||||
struct nl_msg *msg;
|
||||
int i, err, sock;
|
||||
uint32_t devnum = dev_num;
|
||||
|
||||
if (reconfigure && dev_num < 0)
|
||||
{
|
||||
return -NLE_INVAL;
|
||||
}
|
||||
|
||||
netlink_sock_alloc(&ctx);
|
||||
|
||||
if (!reconfigure)
|
||||
{
|
||||
// A callback we set for a response we get on send
|
||||
nl_socket_modify_cb(ctx.sk, NL_CB_VALID, NL_CB_CUSTOM, netlink_status_cb, &devnum);
|
||||
}
|
||||
|
||||
msg = nlmsg_alloc();
|
||||
if (!msg)
|
||||
{
|
||||
netlink_sock_free(&ctx);
|
||||
fail("Failed to allocate netlink message\n");
|
||||
}
|
||||
|
||||
genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, ctx.driver_id, 0, 0,
|
||||
reconfigure ? NBD_CMD_RECONFIGURE : NBD_CMD_CONNECT, 0);
|
||||
|
||||
if (dev_num >= 0)
|
||||
{
|
||||
NLA_PUT_U32(msg, NBD_ATTR_INDEX, (uint32_t)dev_num);
|
||||
}
|
||||
|
||||
NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size);
|
||||
NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, blocksize);
|
||||
NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, flags);
|
||||
NLA_PUT_U64(msg, NBD_ATTR_CLIENT_FLAGS, cflags);
|
||||
|
||||
if (timeout)
|
||||
{
|
||||
NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, timeout);
|
||||
}
|
||||
|
||||
if (conn_timeout)
|
||||
{
|
||||
NLA_PUT_U64(msg, NBD_ATTR_DEAD_CONN_TIMEOUT, conn_timeout);
|
||||
}
|
||||
|
||||
#ifdef NBD_ATTR_BACKEND_IDENTIFIER
|
||||
if (backend)
|
||||
{
|
||||
// Backend is an attribute useful for identication of the device
|
||||
// Also it prevents reconfiguration of the device with a different backend string
|
||||
NLA_PUT_STRING(msg, NBD_ATTR_BACKEND_IDENTIFIER, backend);
|
||||
}
|
||||
#endif
|
||||
|
||||
msg_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS);
|
||||
if (!msg_attr)
|
||||
{
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
for (i = 0; i < sock_size; i++)
|
||||
{
|
||||
msg_opt_attr = nla_nest_start(msg, NBD_SOCK_ITEM);
|
||||
if (!msg_opt_attr)
|
||||
{
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
sock = sockfd[i];
|
||||
NLA_PUT_U32(msg, NBD_SOCK_FD, sock);
|
||||
|
||||
nla_nest_end(msg, msg_opt_attr);
|
||||
}
|
||||
|
||||
nla_nest_end(msg, msg_attr);
|
||||
|
||||
if ((err = nl_send_sync(ctx.sk, msg)) != 0)
|
||||
{
|
||||
netlink_sock_free(&ctx);
|
||||
return err;
|
||||
}
|
||||
|
||||
netlink_sock_free(&ctx);
|
||||
|
||||
return devnum;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_free(msg);
|
||||
netlink_sock_free(&ctx);
|
||||
fail("Failed to create netlink message\n");
|
||||
}
|
||||
|
||||
static void netlink_disconnect(uint32_t dev_num)
|
||||
{
|
||||
struct netlink_ctx ctx;
|
||||
struct nl_msg *msg;
|
||||
int err;
|
||||
|
||||
netlink_sock_alloc(&ctx);
|
||||
|
||||
msg = nlmsg_alloc();
|
||||
if (!msg)
|
||||
{
|
||||
netlink_sock_free(&ctx);
|
||||
fail("Failed to allocate netlink message\n");
|
||||
}
|
||||
|
||||
genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, ctx.driver_id, 0, 0, NBD_CMD_DISCONNECT, 0);
|
||||
NLA_PUT_U32(msg, NBD_ATTR_INDEX, dev_num);
|
||||
|
||||
if ((err = nl_send_sync(ctx.sk, msg)) < 0)
|
||||
{
|
||||
netlink_sock_free(&ctx);
|
||||
fail("Failed to send netlink message %d\n", err);
|
||||
}
|
||||
|
||||
netlink_sock_free(&ctx);
|
||||
|
||||
return;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_free(msg);
|
||||
netlink_sock_free(&ctx);
|
||||
fail("Failed to create netlink message\n");
|
||||
}
|
||||
|
||||
#undef fail
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef MSG_ZEROCOPY
|
||||
#define MSG_ZEROCOPY 0
|
||||
|
@ -24,13 +252,78 @@
|
|||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
const char *help_text =
|
||||
"Vitastor NBD proxy " VERSION "\n"
|
||||
"(c) Vitaliy Filippov, 2020+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"COMMANDS:\n"
|
||||
"\n"
|
||||
"vitastor-nbd map [OPTIONS] [/dev/nbdN] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||
" Map an NBD device using ioctl interface. Options:\n"
|
||||
" --nbd_timeout 0\n"
|
||||
" Timeout for I/O operations in seconds after exceeding which the kernel stops the device.\n"
|
||||
" Before Linux 5.19, if nbd_timeout is 0, a dead NBD device can't be removed from\n"
|
||||
" the system at all without rebooting.\n"
|
||||
" --nbd_max_devices 64 --nbd_max_part 3\n"
|
||||
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" --logfile /path/to/log/file.txt\n"
|
||||
" Write log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" or printing them to the standard output (in foreground mode).\n"
|
||||
" --dev_num N\n"
|
||||
" Use the specified device /dev/nbdN instead of automatic selection (alternative syntax\n"
|
||||
" to /dev/nbdN positional parameter).\n"
|
||||
" --foreground 1\n"
|
||||
" Stay in foreground, do not daemonize.\n"
|
||||
"\n"
|
||||
"vitastor-nbd unmap /dev/nbdN\n"
|
||||
" Unmap an ioctl-mapped NBD device.\n"
|
||||
"\n"
|
||||
"vitastor-nbd ls [--json]\n"
|
||||
" List ioctl-mapped Vitastor NBD devices, optionally in JSON format.\n"
|
||||
"\n"
|
||||
#ifdef HAVE_NBD_NETLINK_H
|
||||
"vitastor-nbd netlink-map [/dev/nbd<number>] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||
" Map a device using netlink interface. Experimental mode. Differences from 'map':\n"
|
||||
" 1) netlink-map can create new /dev/nbdN devices.\n"
|
||||
" 2) netlink-mapped devices can be unmapped only using netlink-unmap command.\n"
|
||||
" 3) netlink-mapped devices don't show up `ls` output (yet).\n"
|
||||
" 4) dead netlink-mapped devices can be 'revived' (however, old I/O may hang forever without timeout).\n"
|
||||
" 5) netlink-map supports additional options:\n"
|
||||
" --nbd_conn_timeout 0\n"
|
||||
" Disconnect a dead device automatically after this number of seconds.\n"
|
||||
#ifdef NBD_CFLAG_DESTROY_ON_DISCONNECT
|
||||
" --nbd_destroy_on_disconnect 1\n"
|
||||
" Delete the nbd device on disconnect.\n"
|
||||
#endif
|
||||
#ifdef NBD_CFLAG_DISCONNECT_ON_CLOSE
|
||||
" --nbd_disconnect_on_close 1\n"
|
||||
" Disconnect the nbd device on close by last opener.\n"
|
||||
#endif
|
||||
#ifdef NBD_FLAG_READ_ONLY
|
||||
" --nbd_ro 1\n"
|
||||
" Set device into read only mode.\n"
|
||||
#endif
|
||||
"\n"
|
||||
"vitastor-nbd netlink-unmap /dev/nbdN\n"
|
||||
" Unmap a device using netlink interface. Works with both netlink and ioctl mapped devices.\n"
|
||||
"\n"
|
||||
"vitastor-nbd netlink-revive /dev/nbdN (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||
" Restart a dead NBD device without removing it. Supports the same options as netlink-map.\n"
|
||||
"\n"
|
||||
#endif
|
||||
"Use vitastor-nbd --help <command> for command details or vitastor-nbd --help --all for all details.\n"
|
||||
"\n"
|
||||
"All usual Vitastor config options like --config_file <path_to_config> may also be specified in CLI.\n"
|
||||
;
|
||||
|
||||
class nbd_proxy
|
||||
{
|
||||
protected:
|
||||
std::string image_name;
|
||||
uint64_t inode = 0;
|
||||
uint64_t device_size = 0;
|
||||
int nbd_timeout = 300;
|
||||
uint64_t nbd_conn_timeout = 0;
|
||||
int nbd_timeout = 0;
|
||||
int nbd_max_devices = 64;
|
||||
int nbd_max_part = 3;
|
||||
inode_watch_t *watch = NULL;
|
||||
|
@ -74,19 +367,19 @@ public:
|
|||
{
|
||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
||||
{
|
||||
help();
|
||||
cfg["help"] = 1;
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
||||
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "all") || i == narg-1 ? "1" : args[++i];
|
||||
}
|
||||
else if (pos == 0)
|
||||
{
|
||||
cfg["command"] = args[i];
|
||||
pos++;
|
||||
}
|
||||
else if (pos == 1 && (cfg["command"] == "map" || cfg["command"] == "unmap"))
|
||||
else if (pos == 1)
|
||||
{
|
||||
int n = 0;
|
||||
if (sscanf(args[i], "/dev/nbd%d", &n) > 0)
|
||||
|
@ -101,9 +394,13 @@ public:
|
|||
|
||||
void exec(json11::Json cfg)
|
||||
{
|
||||
if (cfg["help"].bool_value())
|
||||
{
|
||||
goto help;
|
||||
}
|
||||
if (cfg["command"] == "map")
|
||||
{
|
||||
start(cfg);
|
||||
start(cfg, false, false);
|
||||
}
|
||||
else if (cfg["command"] == "unmap")
|
||||
{
|
||||
|
@ -112,8 +409,28 @@ public:
|
|||
fprintf(stderr, "device name or number is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
unmap(cfg["dev_num"].uint64_value());
|
||||
if (cfg["netlink"].is_null())
|
||||
{
|
||||
ioctl_unmap(cfg["dev_num"].uint64_value());
|
||||
}
|
||||
else
|
||||
{
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_NBD_NETLINK_H
|
||||
else if (cfg["command"] == "netlink-map")
|
||||
{
|
||||
start(cfg, true, false);
|
||||
}
|
||||
else if (cfg["command"] == "netlink-revive")
|
||||
{
|
||||
start(cfg, true, true);
|
||||
}
|
||||
else if (cfg["command"] == "netlink-unmap")
|
||||
{
|
||||
netlink_disconnect(cfg["dev_num"].uint64_value());
|
||||
}
|
||||
#endif
|
||||
else if (cfg["command"] == "ls" || cfg["command"] == "list" || cfg["command"] == "list-mapped")
|
||||
{
|
||||
auto mapped = list_mapped();
|
||||
|
@ -121,43 +438,13 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
help();
|
||||
help:
|
||||
print_help(help_text, "vitastor-nbd", cfg["command"].string_value(), cfg["all"].bool_value());
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
static void help()
|
||||
{
|
||||
printf(
|
||||
"Vitastor NBD proxy\n"
|
||||
"(c) Vitaliy Filippov, 2020-2021 (VNPL-1.1)\n\n"
|
||||
"USAGE:\n"
|
||||
" %s map [OPTIONS] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||
" %s unmap /dev/nbd0\n"
|
||||
" %s ls [--json]\n"
|
||||
"OPTIONS:\n"
|
||||
" All usual Vitastor config options like --config_file <path_to_config> plus NBD-specific:\n"
|
||||
" --nbd_timeout 300\n"
|
||||
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
|
||||
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
|
||||
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
|
||||
" --nbd_max_devices 64 --nbd_max_part 3\n"
|
||||
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
|
||||
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
|
||||
" Note that nbd_timeout, nbd_max_devices and nbd_max_part options may also be specified\n"
|
||||
" in /etc/vitastor/vitastor.conf or in other configuration file specified with --config_file.\n"
|
||||
" --logfile /path/to/log/file.txt\n"
|
||||
" Write log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" or printing them to the standard output (in foreground mode).\n"
|
||||
" --dev_num N\n"
|
||||
" Use the specified device /dev/nbdN instead of automatic selection.\n"
|
||||
" --foreground 1\n"
|
||||
" Stay in foreground, do not daemonize.\n",
|
||||
exe_name, exe_name, exe_name
|
||||
);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
void unmap(int dev_num)
|
||||
void ioctl_unmap(int dev_num)
|
||||
{
|
||||
char path[64] = { 0 };
|
||||
sprintf(path, "/dev/nbd%d", dev_num);
|
||||
|
@ -176,7 +463,7 @@ public:
|
|||
close(nbd);
|
||||
}
|
||||
|
||||
void start(json11::Json cfg)
|
||||
void start(json11::Json cfg, bool netlink, bool revive)
|
||||
{
|
||||
// Check options
|
||||
if (cfg["image"].string_value() != "")
|
||||
|
@ -206,19 +493,6 @@ public:
|
|||
exit(1);
|
||||
}
|
||||
}
|
||||
auto file_config = osd_messenger_t::read_config(cfg);
|
||||
if (file_config["nbd_max_devices"].is_number() || file_config["nbd_max_devices"].is_string())
|
||||
{
|
||||
nbd_max_devices = file_config["nbd_max_devices"].uint64_value();
|
||||
}
|
||||
if (file_config["nbd_max_part"].is_number() || file_config["nbd_max_part"].is_string())
|
||||
{
|
||||
nbd_max_part = file_config["nbd_max_part"].uint64_value();
|
||||
}
|
||||
if (file_config["nbd_timeout"].is_number() || file_config["nbd_timeout"].is_string())
|
||||
{
|
||||
nbd_timeout = file_config["nbd_timeout"].uint64_value();
|
||||
}
|
||||
if (cfg["client_writeback_allowed"].is_null())
|
||||
{
|
||||
// NBD is always aware of fsync, so we allow write-back cache
|
||||
|
@ -227,6 +501,7 @@ public:
|
|||
obj["client_writeback_allowed"] = true;
|
||||
cfg = obj;
|
||||
}
|
||||
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
|
@ -250,6 +525,25 @@ public:
|
|||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// cli->config contains merged config
|
||||
if (cli->config.find("nbd_max_devices") != cli->config.end())
|
||||
{
|
||||
nbd_max_devices = cli->config["nbd_max_devices"].uint64_value();
|
||||
}
|
||||
if (cli->config.find("nbd_max_part") != cli->config.end())
|
||||
{
|
||||
nbd_max_part = cli->config["nbd_max_part"].uint64_value();
|
||||
}
|
||||
if (cli->config.find("nbd_timeout") != cli->config.end())
|
||||
{
|
||||
nbd_timeout = cli->config["nbd_timeout"].uint64_value();
|
||||
}
|
||||
if (cli->config.find("nbd_conn_timeout") != cli->config.end())
|
||||
{
|
||||
nbd_conn_timeout = cli->config["nbd_conn_timeout"].uint64_value();
|
||||
}
|
||||
|
||||
// Initialize NBD
|
||||
int sockfd[2];
|
||||
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockfd) < 0)
|
||||
|
@ -257,46 +551,87 @@ public:
|
|||
perror("socketpair");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fcntl(sockfd[0], F_SETFL, fcntl(sockfd[0], F_GETFL, 0) | O_NONBLOCK);
|
||||
nbd_fd = sockfd[0];
|
||||
load_module();
|
||||
bool bg = cfg["foreground"].is_null();
|
||||
if (!cfg["dev_num"].is_null())
|
||||
|
||||
if (netlink)
|
||||
{
|
||||
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg) < 0)
|
||||
#ifdef HAVE_NBD_NETLINK_H
|
||||
int devnum = -1;
|
||||
if (!cfg["dev_num"].is_null())
|
||||
{
|
||||
perror("run_nbd");
|
||||
devnum = (int)cfg["dev_num"].uint64_value();
|
||||
}
|
||||
uint64_t flags = NBD_FLAG_SEND_FLUSH;
|
||||
uint64_t cflags = 0;
|
||||
#ifdef NBD_FLAG_READ_ONLY
|
||||
if (!cfg["nbd_ro"].is_null())
|
||||
flags |= NBD_FLAG_READ_ONLY;
|
||||
#endif
|
||||
#ifdef NBD_CFLAG_DESTROY_ON_DISCONNECT
|
||||
if (!cfg["nbd_destroy_on_disconnect"].is_null())
|
||||
cflags |= NBD_CFLAG_DESTROY_ON_DISCONNECT;
|
||||
#endif
|
||||
#ifdef NBD_CFLAG_DISCONNECT_ON_CLOSE
|
||||
if (!cfg["nbd_disconnect_on_close"].is_null())
|
||||
cflags |= NBD_CFLAG_DISCONNECT_ON_CLOSE;
|
||||
#endif
|
||||
int err = netlink_configure(sockfd + 1, 1, devnum, device_size, 4096, flags, cflags, nbd_timeout, nbd_conn_timeout, NULL, revive);
|
||||
if (err < 0)
|
||||
{
|
||||
errno = (err == -NLE_BUSY ? EBUSY : EIO);
|
||||
fprintf(stderr, "netlink_configure failed: %s (code %d)\n", nl_geterror(err), err);
|
||||
exit(1);
|
||||
}
|
||||
close(sockfd[1]);
|
||||
printf("/dev/nbd%d\n", err);
|
||||
#else
|
||||
fprintf(stderr, "netlink support is disabled in this build\n");
|
||||
exit(1);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// Find an unused device
|
||||
int i = 0;
|
||||
while (true)
|
||||
if (!cfg["dev_num"].is_null())
|
||||
{
|
||||
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
|
||||
if (r == 0)
|
||||
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg) < 0)
|
||||
{
|
||||
printf("/dev/nbd%d\n", i);
|
||||
break;
|
||||
}
|
||||
else if (r == -1 && errno == ENOENT)
|
||||
{
|
||||
fprintf(stderr, "No free NBD devices found\n");
|
||||
exit(1);
|
||||
}
|
||||
else if (r == -2 && errno == EBUSY)
|
||||
{
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%d %d\n", r, errno);
|
||||
perror("run_nbd");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Find an unused device
|
||||
int i = 0;
|
||||
while (true)
|
||||
{
|
||||
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
|
||||
if (r == 0)
|
||||
{
|
||||
printf("/dev/nbd%d\n", i);
|
||||
break;
|
||||
}
|
||||
else if (r == -1 && errno == ENOENT)
|
||||
{
|
||||
fprintf(stderr, "No free NBD devices found\n");
|
||||
exit(1);
|
||||
}
|
||||
else if (r == -2 && errno == EBUSY)
|
||||
{
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%d %d\n", r, errno);
|
||||
perror("run_nbd");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cfg["logfile"].string_value() != "")
|
||||
{
|
||||
|
@ -368,9 +703,10 @@ public:
|
|||
return;
|
||||
}
|
||||
int r;
|
||||
// Kernel built-in default is 16 devices with up to 16 partitions per device which is a big shit
|
||||
// 64 also isn't too high, but the possible maximum is nbds_max=256 max_part=0 and it won't reserve
|
||||
// any block device minor numbers for partitions
|
||||
// NBD module creates ALL <nbd_max_devices> devices in /dev/ when loaded
|
||||
// Kernel built-in default is 16 devices with up to 16 partitions per device which is a bit too low.
|
||||
// ...and ioctl setup method can't create additional devices.
|
||||
// netlink setup method, however, CAN create additional devices.
|
||||
if ((r = system(("modprobe nbd nbds_max="+std::to_string(nbd_max_devices)+" max_part="+std::to_string(nbd_max_part)).c_str())) != 0)
|
||||
{
|
||||
if (r < 0)
|
||||
|
|
|
@ -497,7 +497,7 @@ static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
|
|||
auto & ext = self->parent->blockfs->extends[inode];
|
||||
if (r.err)
|
||||
{
|
||||
fprintf(stderr, "Error extending inode %lu to %lu bytes: %s\n", inode, new_size, r.text.c_str());
|
||||
fprintf(stderr, "Error extending inode %ju to %ju bytes: %s\n", inode, new_size, r.text.c_str());
|
||||
}
|
||||
if (r.err == EAGAIN || ext.next_extend > ext.cur_extend)
|
||||
{
|
||||
|
|
|
@ -258,7 +258,7 @@ resume_3:
|
|||
auto name = kv_direntry_filename(st->cur_key);
|
||||
if (st->self->parent->trace)
|
||||
{
|
||||
fprintf(stderr, "[%d] READDIR %ju %lu %s\n",
|
||||
fprintf(stderr, "[%d] READDIR %ju %ju %s\n",
|
||||
st->self->nfs_fd, st->dir_ino, st->offset, name.c_str());
|
||||
}
|
||||
auto fh = kv_fh(ino);
|
||||
|
|
|
@ -608,7 +608,7 @@ static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state, int base_stat
|
|||
auto ientry = json11::Json::parse(old_value, err).object_items();
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Invalid JSON in inode %lu = %s: %s\n", st->ino, old_value.c_str(), err.c_str());
|
||||
fprintf(stderr, "Invalid JSON in inode %ju = %s: %s\n", st->ino, old_value.c_str(), err.c_str());
|
||||
st->res2 = -EINVAL;
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -301,8 +301,12 @@ class osd_t
|
|||
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||
pg_osd_set_state_t *mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
|
||||
std::function<int(pg_osd_set_t & new_set)> calc_set);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
|
||||
pg_osd_set_state_t *mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref);
|
||||
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
|
@ -317,6 +321,7 @@ class osd_t
|
|||
void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
void submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd_set);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||
|
||||
|
|
|
@ -534,7 +534,7 @@ void osd_t::renew_lease(bool reload)
|
|||
{ "ID", etcd_lease_id }
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
|
||||
{
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
if (err == "" && data["result"]["TTL"].uint64_value() == 0)
|
||||
{
|
||||
// Die
|
||||
fprintf(stderr, "Error refreshing etcd lease\n");
|
||||
|
|
|
@ -258,6 +258,9 @@ struct __attribute__((__packed__)) osd_op_describe_t
|
|||
uint64_t max_inode, max_offset;
|
||||
// limit
|
||||
uint64_t limit;
|
||||
// pool and PG
|
||||
uint32_t pool_id;
|
||||
uint32_t pg_num;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_describe_t
|
||||
|
|
|
@ -299,8 +299,8 @@ resume_2:
|
|||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
|
||||
pg_osd_set_state_t *osd_t::mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
|
||||
std::function<int(pg_osd_set_t & new_set)> calc_set)
|
||||
{
|
||||
pg_osd_set_state_t *object_state = NULL;
|
||||
get_object_osd_set(pg, oid, &object_state);
|
||||
|
@ -315,58 +315,22 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
}
|
||||
return object_state;
|
||||
}
|
||||
pg_osd_set_t corrupted_set;
|
||||
pg_osd_set_t new_set;
|
||||
if (object_state)
|
||||
{
|
||||
corrupted_set = object_state->osd_set;
|
||||
new_set = object_state->osd_set;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < pg.cur_set.size(); i++)
|
||||
{
|
||||
corrupted_set.push_back((pg_obj_loc_t){
|
||||
new_set.push_back((pg_obj_loc_t){
|
||||
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
|
||||
.osd_num = pg.cur_set[i],
|
||||
});
|
||||
}
|
||||
}
|
||||
// Mark object chunk(s) as corrupted
|
||||
int changes = 0;
|
||||
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
|
||||
{
|
||||
auto & chunk = *chunk_it;
|
||||
if (stripes[chunk.role].osd_num == chunk.osd_num)
|
||||
{
|
||||
if (stripes[chunk.role].not_exists)
|
||||
{
|
||||
changes++;
|
||||
corrupted_set.erase(chunk_it, chunk_it+1);
|
||||
continue;
|
||||
}
|
||||
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad = LOC_CORRUPTED;
|
||||
}
|
||||
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
|
||||
(chunk.loc_bad & LOC_CORRUPTED))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_CORRUPTED;
|
||||
}
|
||||
}
|
||||
if (inconsistent && !chunk.loc_bad)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_INCONSISTENT;
|
||||
}
|
||||
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_INCONSISTENT;
|
||||
}
|
||||
chunk_it++;
|
||||
}
|
||||
int changes = calc_set(new_set);
|
||||
if (!changes)
|
||||
{
|
||||
// No chunks newly marked as corrupted - object is already marked or moved
|
||||
|
@ -379,7 +343,7 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
deref_object_state(pg, &object_state, ref);
|
||||
}
|
||||
// Insert object into the new state and retry
|
||||
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
|
||||
object_state = add_object_to_set(pg, oid, new_set, old_pg_state, 2);
|
||||
if (ref)
|
||||
{
|
||||
object_state->ref_count++;
|
||||
|
@ -387,6 +351,76 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
return object_state;
|
||||
}
|
||||
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
|
||||
{
|
||||
return mark_object(pg, oid, prev_object_state, ref, [stripes, inconsistent](pg_osd_set_t & new_set)
|
||||
{
|
||||
// Mark object chunk(s) as corrupted
|
||||
int changes = 0;
|
||||
for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
|
||||
{
|
||||
auto & chunk = *chunk_it;
|
||||
if (stripes[chunk.role].osd_num == chunk.osd_num)
|
||||
{
|
||||
if (stripes[chunk.role].not_exists)
|
||||
{
|
||||
changes++;
|
||||
new_set.erase(chunk_it, chunk_it+1);
|
||||
continue;
|
||||
}
|
||||
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad = LOC_CORRUPTED;
|
||||
}
|
||||
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
|
||||
(chunk.loc_bad & LOC_CORRUPTED))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_CORRUPTED;
|
||||
}
|
||||
}
|
||||
if (inconsistent && !chunk.loc_bad)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_INCONSISTENT;
|
||||
}
|
||||
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_INCONSISTENT;
|
||||
}
|
||||
chunk_it++;
|
||||
}
|
||||
return changes;
|
||||
});
|
||||
}
|
||||
|
||||
// Mark the object as partially updated (probably due to a ENOSPC)
|
||||
pg_osd_set_state_t *osd_t::mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref)
|
||||
{
|
||||
return mark_object(pg, oid, prev_object_state, ref, [stripes](pg_osd_set_t & new_set)
|
||||
{
|
||||
// Mark object chunk(s) as outdated
|
||||
int changes = 0;
|
||||
for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
|
||||
{
|
||||
auto & chunk = *chunk_it;
|
||||
if (stripes[chunk.role].osd_num == chunk.osd_num &&
|
||||
stripes[chunk.role].read_error &&
|
||||
chunk.loc_bad != LOC_OUTDATED)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad = LOC_OUTDATED;
|
||||
}
|
||||
chunk_it++;
|
||||
}
|
||||
return changes;
|
||||
});
|
||||
}
|
||||
|
||||
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level)
|
||||
{
|
||||
|
|
|
@ -25,7 +25,7 @@ struct osd_primary_op_data_t
|
|||
uint64_t target_ver;
|
||||
uint64_t orig_ver = 0, fact_ver = 0;
|
||||
uint64_t scheme = 0;
|
||||
int n_subops = 0, done = 0, errors = 0, errcode = 0;
|
||||
int n_subops = 0, done = 0, errors = 0, drops = 0, errcode = 0;
|
||||
int degraded = 0, pg_size, pg_data_size;
|
||||
osd_rmw_stripe_t *stripes;
|
||||
osd_op_t *subops = NULL;
|
||||
|
|
|
@ -95,7 +95,19 @@ void osd_t::continue_primary_describe(osd_op_t *cur_op)
|
|||
if (!desc.object_state)
|
||||
desc.object_state = ~desc.object_state;
|
||||
std::vector<unclean_list_t> lists;
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
auto pg_first = pgs.begin();
|
||||
auto pg_last = pgs.end();
|
||||
if (desc.pool_id && desc.pg_num)
|
||||
{
|
||||
pg_first = pgs.find((pool_pg_num_t){ .pool_id = desc.pool_id, .pg_num = desc.pg_num });
|
||||
pg_last = pg_first != pgs.end() ? std::next(pg_first) : pgs.end();
|
||||
}
|
||||
else if (desc.pool_id)
|
||||
{
|
||||
pg_first = pgs.lower_bound((pool_pg_num_t){ .pool_id = desc.pool_id });
|
||||
pg_last = pgs.lower_bound((pool_pg_num_t){ .pool_id = desc.pool_id+1 });
|
||||
}
|
||||
for (auto pg_it = pg_first; pg_it != pg_last; pg_it++)
|
||||
{
|
||||
auto & pg = pg_it->second;
|
||||
if (desc.object_state & OBJ_INCONSISTENT)
|
||||
|
|
|
@ -133,7 +133,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
|||
zero_read = -1;
|
||||
osd_op_t *subops = new osd_op_t[n_subops];
|
||||
op_data->fact_ver = 0;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
op_data->done = op_data->errors = op_data->drops = op_data->errcode = 0;
|
||||
op_data->n_subops = n_subops;
|
||||
op_data->subops = subops;
|
||||
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
||||
|
@ -363,6 +363,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
|
||||
}
|
||||
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM) ||
|
||||
opcode == OSD_OP_SEC_WRITE && retval != expected)
|
||||
{
|
||||
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
|
||||
// And we'll mark write as failed
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
|
||||
}
|
||||
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
|
||||
{
|
||||
uint64_t version = subop->reply.sec_rw.version;
|
||||
|
@ -404,14 +411,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||
osd_op_names[opcode], subop->peer_fd, retval, expected
|
||||
);
|
||||
}
|
||||
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
|
||||
{
|
||||
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
|
||||
}
|
||||
subop->rmw_buf = NULL;
|
||||
// Error priority: ENOSPC and others > EIO > EDOM > EPIPE
|
||||
// Error priority: ENOSPC > others > EIO > EDOM > EPIPE
|
||||
if (op_data->errcode == 0 ||
|
||||
retval == -ENOSPC && op_data->errcode != -ENOSPC ||
|
||||
retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
|
||||
retval == -EDOM && (op_data->errcode == -EPIPE) ||
|
||||
retval != -EIO && retval != -EDOM && retval != -EPIPE)
|
||||
|
@ -424,6 +427,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
||||
{
|
||||
// Drop connection on unexpected errors
|
||||
op_data->drops++;
|
||||
msgr.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
|
@ -705,6 +709,96 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd_set)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
osd_rmw_stripe_t *stripes = op_data->stripes;
|
||||
assert(op_data->scheme != POOL_SCHEME_REPLICATED);
|
||||
// Allocate subops
|
||||
int n_subops = 0;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (osd_set[role] != 0 && !stripes[role].read_error &&
|
||||
msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
n_subops++;
|
||||
}
|
||||
}
|
||||
op_data->n_subops = n_subops;
|
||||
op_data->done = op_data->errors = 0;
|
||||
if (!op_data->n_subops)
|
||||
{
|
||||
return;
|
||||
}
|
||||
op_data->subops = new osd_op_t[n_subops];
|
||||
op_data->unstable_writes = new obj_ver_id[n_subops];
|
||||
int i = 0;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (osd_set[role] != 0 && !stripes[role].read_error &&
|
||||
msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
osd_op_t *subop = &op_data->subops[i];
|
||||
op_data->unstable_writes[i] = (obj_ver_id){
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | role,
|
||||
},
|
||||
.version = op_data->target_ver-1,
|
||||
};
|
||||
if (osd_set[role] == this->osd_num)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
|
||||
subop->op_type = (uint64_t)cur_op;
|
||||
subop->bs_op = new blockstore_op_t((blockstore_op_t){
|
||||
.opcode = BS_OP_ROLLBACK,
|
||||
.callback = [subop, this](blockstore_op_t *bs_subop)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
{
|
||||
.len = 1,
|
||||
},
|
||||
.buf = (void*)(op_data->unstable_writes + i),
|
||||
});
|
||||
#ifdef OSD_DEBUG
|
||||
printf(
|
||||
"Submit rollback to local: %jx:%jx v%ju\n",
|
||||
op_data->oid.inode, op_data->oid.stripe | role, op_data->target_ver-1
|
||||
);
|
||||
#endif
|
||||
bs->enqueue_op(subop->bs_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_ROLLBACK,
|
||||
},
|
||||
.len = sizeof(obj_ver_id),
|
||||
} };
|
||||
subop->iov.push_back(op_data->unstable_writes + i, sizeof(obj_ver_id));
|
||||
subop->callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
#ifdef OSD_DEBUG
|
||||
printf(
|
||||
"Submit rollback to osd %ju: %jx:%jx v%ju\n", osd_set[role],
|
||||
op_data->oid.inode, op_data->oid.stripe | role, op_data->target_ver-1
|
||||
);
|
||||
#endif
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(osd_set[role]);
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
|
||||
{
|
||||
auto st_it = pg.write_queue.find(oid), it = st_it;
|
||||
|
|
|
@ -49,6 +49,8 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
|
|||
else if (op_data->st == 8) goto resume_8;
|
||||
else if (op_data->st == 9) goto resume_9;
|
||||
else if (op_data->st == 10) goto resume_10;
|
||||
else if (op_data->st == 11) goto resume_11;
|
||||
else if (op_data->st == 12) goto resume_12;
|
||||
assert(op_data->st == 0);
|
||||
if (!check_write_queue(cur_op, pg))
|
||||
{
|
||||
|
@ -259,11 +261,31 @@ resume_5:
|
|||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
// FIXME: Handle ENOSPC. If one of the subops fail with ENOSPC here,
|
||||
// Handle ENOSPC/EDOM/ERANGE/EIO. If some subops fail, but others succeed,
|
||||
// next writes to the same object will also fail because they'll try
|
||||
// to overwrite the same version number which will result in EEXIST.
|
||||
// To fix it, we should mark the object as degraded for replicas,
|
||||
// and rollback successful part updates in case of EC.
|
||||
if (op_data->done > 0 && !op_data->drops)
|
||||
{
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
submit_primary_rollback_subops(cur_op, op_data->prev_set);
|
||||
resume_11:
|
||||
op_data->st = 11;
|
||||
return;
|
||||
resume_12:
|
||||
// Ignore ROLLBACK errors - submit_primary_subops will drop the connection if it fails
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
mark_partial_write(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
}
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
|
|
|
@ -209,7 +209,7 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
|
|||
const char *var_end = var_start;
|
||||
while (*var_end && !isspace(*var_end))
|
||||
var_end++;
|
||||
if ((std::string(var_start, var_end-var_start)+"|").find(cmd+"|") != std::string::npos)
|
||||
if (("|"+std::string(var_start, var_end-var_start)+"|").find("|"+cmd+"|") != std::string::npos)
|
||||
found = matched = true;
|
||||
}
|
||||
else if (*next_line && isspace(*next_line))
|
||||
|
|
|
@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
|||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 1.5.0
|
||||
Version: 1.6.1
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ start_etcd()
|
|||
--advertise-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) --listen-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) \
|
||||
--initial-advertise-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) --listen-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) \
|
||||
--initial-cluster-token vitastor-tests-etcd --initial-cluster-state new \
|
||||
--initial-cluster "$ETCD_CLUSTER" \
|
||||
--initial-cluster "$ETCD_CLUSTER" --max-request-bytes=104857600 \
|
||||
--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision &>./testdata/etcd$i.log &
|
||||
eval ETCD${i}_PID=$!
|
||||
}
|
||||
|
|
|
@ -45,6 +45,8 @@ IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
|
|||
SCHEME=ec ./test_rebalance_verify.sh
|
||||
SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
|
||||
|
||||
./test_root_node.sh
|
||||
|
||||
./test_switch_primary.sh
|
||||
|
||||
./test_write.sh
|
||||
|
@ -62,6 +64,13 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
|
|||
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
|
||||
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
|
||||
|
||||
./test_osd_tags.sh
|
||||
|
||||
./test_enospc.sh
|
||||
SCHEME=xor ./test_enospc.sh
|
||||
IMMEDIATE_COMMIT=1 ./test_enospc.sh
|
||||
IMMEDIATE_COMMIT=1 SCHEME=xor ./test_enospc.sh
|
||||
|
||||
./test_scrub.sh
|
||||
ZERO_OSD=2 ./test_scrub.sh
|
||||
SCHEME=xor ./test_scrub.sh
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue