Compare commits

25 Commits

- fb2f7a0d3c
- 38d85da19a
- dc3caee284
- 89dcda1fed
- 1526e2055e
- 74cb3911db
- d5efbbb6b9
- 4319091bd3
- 6d307d5391
- 065dfef683
- 4d6b85fe67
- 2dd2f29f46
- fc3a1e076a
- 3a3e168c42
- 95c55da0ad
- 5cf1157f16
- acf637950c
- a02b02eb04
- 7d3d696110
- 712576ca75
- 28bd94d2c2
- 148ff04aa8
- e86df4a2a2
- e74af9745e
- 0e0509e3da
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
 
 project(vitastor)
 
-set(VERSION "0.6.5")
+set(VERSION "0.6.6")
 
 add_subdirectory(src)
@@ -40,7 +40,7 @@ Vitastor на данный момент находится в статусе п
 - Драйвер диска для QEMU (собирается вне дерева исходников QEMU)
 - Драйвер диска для утилиты тестирования производительности fio (также собирается вне дерева исходников fio)
 - NBD-прокси для монтирования образов ядром ("блочное устройство в режиме пользователя")
-- Утилита удаления образов/инодов (vitastor-rm)
+- Утилита для удаления образов/инодов (vitastor-cli rm)
 - Пакеты для Debian и CentOS
 - Статистика операций ввода/вывода и занятого места в разрезе инодов
 - Именование инодов через хранение их метаданных в etcd
@@ -49,6 +49,7 @@ Vitastor на данный момент находится в статусе п
 - Поддержка RDMA/RoCEv2 через libibverbs
 - CSI-плагин для Kubernetes
 - Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt
+- Слияние снапшотов (vitastor-cli {snap-rm,flatten,merge})
 
 ## Планы развития
 
@@ -491,10 +492,10 @@ qemu-system-x86_64 -enable-kvm -m 1024
 
 ### Удалить образ
 
-Используйте утилиту vitastor-rm. Например:
+Используйте утилиту vitastor-cli rm. Например:
 
 ```
-vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+vitastor-cli rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
 ```
 
 ### NBD
@@ -34,7 +34,7 @@ breaking changes in the future. However, the following is implemented:
 - QEMU driver (built out-of-tree)
 - Loadable fio engine for benchmarks (also built out-of-tree)
 - NBD proxy for kernel mounts
-- Inode removal tool (vitastor-rm)
+- Inode removal tool (vitastor-cli rm)
 - Packaging for Debian and CentOS
 - Per-inode I/O and space usage statistics
 - Inode metadata storage in etcd
@@ -43,6 +43,7 @@ breaking changes in the future. However, the following is implemented:
 - RDMA/RoCEv2 support via libibverbs
 - CSI plugin for Kubernetes
 - Basic OpenStack support: Cinder driver, Nova and libvirt patches
+- Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge})
 
 ## Roadmap
 
@@ -1,4 +1,4 @@
-VERSION ?= v0.6.5
+VERSION ?= v0.6.6
 
 all: build push
 
@@ -49,7 +49,7 @@ spec:
 capabilities:
 add: ["SYS_ADMIN"]
 allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v0.6.5
+image: vitalif/vitastor-csi:v0.6.6
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
@@ -116,7 +116,7 @@ spec:
 privileged: true
 capabilities:
 add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v0.6.5
+image: vitalif/vitastor-csi:v0.6.6
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
@@ -5,7 +5,7 @@ package vitastor
 
 const (
 vitastorCSIDriverName = "csi.vitastor.io"
-vitastorCSIDriverVersion = "0.6.5"
+vitastorCSIDriverVersion = "0.6.6"
 )
 
 // Config struct fills the parameters of request or user input
@@ -354,9 +354,9 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
 return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
 }
 
-// Delete inode data by invoking vitastor-rm
+// Delete inode data by invoking vitastor-cli
 args := []string{
-"--etcd_address", strings.Join(etcdUrl, ","),
+"rm", "--etcd_address", strings.Join(etcdUrl, ","),
 "--pool", fmt.Sprintf("%d", idx.PoolId),
 "--inode", fmt.Sprintf("%d", idx.Id),
 }
@@ -364,7 +364,7 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
 {
 args = append(args, "--config_path", ctxVars["configPath"])
 }
-c := exec.Command("/usr/bin/vitastor-rm", args...)
+c := exec.Command("/usr/bin/vitastor-cli", args...)
 var stderr bytes.Buffer
 c.Stdout = nil
 c.Stderr = &stderr
@@ -372,7 +372,7 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
 stderrStr := string(stderr.Bytes())
 if (err != nil)
 {
-klog.Errorf("vitastor-rm failed: %s, status %s\n", stderrStr, err)
+klog.Errorf("vitastor-cli rm failed: %s, status %s\n", stderrStr, err)
 return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
 }
 
debian/changelog (vendored, 2 changes)
@@ -1,4 +1,4 @@
-vitastor (0.6.5-1) unstable; urgency=medium
+vitastor (0.6.6-1) unstable; urgency=medium
 
 * RDMA support
 * Bugfixes
 
debian/vitastor.Dockerfile (vendored, 12 changes)
@@ -40,10 +40,10 @@ RUN set -e -x; \
 mkdir -p /root/packages/vitastor-$REL; \
 rm -rf /root/packages/vitastor-$REL/*; \
 cd /root/packages/vitastor-$REL; \
-cp -r /root/vitastor vitastor-0.6.5; \
-ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.5/qemu; \
-ln -s /root/fio-build/fio-*/ vitastor-0.6.5/fio; \
-cd vitastor-0.6.5; \
+cp -r /root/vitastor vitastor-0.6.6; \
+ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.6/qemu; \
+ln -s /root/fio-build/fio-*/ vitastor-0.6.6/fio; \
+cd vitastor-0.6.6; \
 FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 sh copy-qemu-includes.sh; \
@@ -59,8 +59,8 @@ RUN set -e -x; \
 echo "dep:fio=$FIO" > debian/substvars; \
 echo "dep:qemu=$QEMU" >> debian/substvars; \
 cd /root/packages/vitastor-$REL; \
-tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.5.orig.tar.xz vitastor-0.6.5; \
-cd vitastor-0.6.5; \
+tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.6.orig.tar.xz vitastor-0.6.6; \
+cd vitastor-0.6.6; \
 V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
 DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
docker/Dockerfile (new file, 9 lines)
@@ -0,0 +1,9 @@
+# Build Docker image with Vitastor packages
+
+FROM debian:bullseye
+
+ADD vitastor.list /etc/apt/sources.list.d
+ADD vitastor.gpg /etc/apt/trusted.gpg.d
+ADD vitastor.pref /etc/apt/preferences.d
+ADD apt.conf /etc/apt/
+RUN apt-get update && apt-get -y install vitastor qemu-system-x86 qemu-system-common && apt-get clean
docker/apt.conf (new file, 1 line)
@@ -0,0 +1 @@
+APT::Install-Recommends false;
docker/vitastor.gpg (new binary file, not shown)
docker/vitastor.list (new file, 1 line)
@@ -0,0 +1 @@
+deb http://vitastor.io/debian bullseye main
docker/vitastor.pref (new file, 3 lines)
@@ -0,0 +1,3 @@
+Package: *
+Pin: origin "vitastor.io"
+Pin-Priority: 1000
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils
 
-VERSION = '0.6.5'
+VERSION = '0.6.6'
 
 LOG = logging.getLogger(__name__)
 
@@ -514,7 +514,7 @@ class VitastorDriver(driver.CloneableImageVD,
 # Clear data
 for kv in layers:
 args = [
-'vitastor-rm', '--pool', str(kv['value']['pool_id']),
+'vitastor-cli', 'rm', '--pool', str(kv['value']['pool_id']),
 '--inode', str(kv['value']['id']), '--progress', '0',
 *(self._vitastor_args())
 ]
@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
 QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
 perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.6.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.5$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.6.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.6$(rpm --eval '%dist').tar.gz *
@@ -38,7 +38,7 @@ ADD . /root/vitastor
 RUN set -e; \
 cd /root/vitastor/rpm; \
 sh build-tarball.sh; \
-cp /root/vitastor-0.6.5.el7.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.6.6.el7.tar.gz ~/rpmbuild/SOURCES; \
 cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
 cd ~/rpmbuild/SPECS/; \
 rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 0.6.5
+Version: 0.6.6
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-0.6.5.el7.tar.gz
+Source0: vitastor-0.6.6.el7.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -57,6 +57,7 @@ cp -r mon %buildroot/usr/lib/vitastor/mon
 %_bindir/vitastor-dump-journal
 %_bindir/vitastor-nbd
 %_bindir/vitastor-osd
+%_bindir/vitastor-cli
 %_bindir/vitastor-rm
 %_libdir/qemu-kvm/block-vitastor.so
 %_libdir/libfio_vitastor.so
@@ -36,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
 cd /root/vitastor/rpm; \
 sh build-tarball.sh; \
-cp /root/vitastor-0.6.5.el8.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.6.6.el8.tar.gz ~/rpmbuild/SOURCES; \
 cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
 cd ~/rpmbuild/SPECS/; \
 rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 0.6.5
+Version: 0.6.6
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-0.6.5.el8.tar.gz
+Source0: vitastor-0.6.6.el8.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -54,6 +54,7 @@ cp -r mon %buildroot/usr/lib/vitastor
 %_bindir/vitastor-dump-journal
 %_bindir/vitastor-nbd
 %_bindir/vitastor-osd
+%_bindir/vitastor-cli
 %_bindir/vitastor-rm
 %_libdir/qemu-kvm/block-vitastor.so
 %_libdir/libfio_vitastor.so
@@ -15,8 +15,8 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()
 
-add_definitions(-DVERSION="0.6.5")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
+add_definitions(-DVERSION="0.6.6")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 add_definitions(-fsanitize=address -fno-omit-frame-pointer)
 add_link_options(-fsanitize=address -fno-omit-frame-pointer)
@@ -36,6 +36,11 @@ string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_F
 string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
 string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
 
+macro(install_symlink filepath sympath)
+install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${filepath} ${sympath})")
+install(CODE "message(\"-- Created symlink: ${sympath} -> ${filepath}\")")
+endmacro(install_symlink)
+
 find_package(PkgConfig)
 pkg_check_modules(LIBURING REQUIRED liburing)
 if (${WITH_QEMU})
@@ -116,6 +121,7 @@ endif (${WITH_FIO})
 # libvitastor_client.so
 add_library(vitastor_client SHARED
 cluster_client.cpp
+cluster_client_list.cpp
 vitastor_c.cpp
 )
 set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
@@ -145,11 +151,11 @@ target_link_libraries(vitastor-nbd
 vitastor_client
 )
 
-# vitastor-rm
-add_executable(vitastor-rm
-rm_inode.cpp
+# vitastor-cli
+add_executable(vitastor-cli
+cli.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
 )
-target_link_libraries(vitastor-rm
+target_link_libraries(vitastor-cli
 vitastor_client
 )
 
@@ -220,7 +226,7 @@ target_link_libraries(test_cas
 # test_cluster_client
 add_executable(test_cluster_client
 test_cluster_client.cpp
-pg_states.cpp osd_ops.cpp cluster_client.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
 etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
@@ -234,7 +240,8 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
 
 ### Install
 
-install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-rm RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+install_symlink(${CMAKE_INSTALL_BINDIR}/vitastor-rm vitastor-cli)
 install(
 TARGETS vitastor_blk vitastor_client
 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -48,6 +48,11 @@ std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
 return impl->inode_space_stats;
 }
 
+void blockstore_t::dump_diagnostics()
+{
+return impl->dump_diagnostics();
+}
+
 uint32_t blockstore_t::get_block_size()
 {
 return impl->get_block_size();
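The new dump_diagnostics() entry point follows the same facade shape as the rest of blockstore_t: the public object only forwards the call to the hidden implementation. A minimal pimpl-style forwarding sketch with hypothetical diag_t / diag_impl_t names (not Vitastor code):

```cpp
#include <cstdio>
#include <memory>

// Hidden implementation: owns the real state and does the printing.
struct diag_impl_t
{
    int queued = 0;
    void dump_diagnostics() { printf("queued=%d\n", queued); }
};

// Public facade: keeps a stable interface and forwards every call.
class diag_t
{
    std::unique_ptr<diag_impl_t> impl = std::make_unique<diag_impl_t>();
public:
    void dump_diagnostics() { impl->dump_diagnostics(); }
};

int main()
{
    diag_t d;
    d.dump_diagnostics(); // prints "queued=0"
    return 0;
}
```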
@@ -186,6 +186,9 @@ public:
 // Get per-inode space usage statistics
 std::map<uint64_t, uint64_t> & get_inode_space_stats();
 
+// Print diagnostics to stdout
+void dump_diagnostics();
+
 // FIXME rename to object_size
 uint32_t get_block_size();
 uint64_t get_block_count();
@@ -182,6 +182,75 @@ void journal_flusher_t::release_trim()
 trim_wanted--;
 }
 
+void journal_flusher_t::dump_diagnostics()
+{
+const char *unflushable_type = "";
+obj_ver_id unflushable = { 0 };
+// Try to find out if there is a flushable object for information
+for (object_id cur_oid: flush_queue)
+{
+obj_ver_id cur = { .oid = cur_oid, .version = flush_versions[cur_oid] };
+auto dirty_end = bs->dirty_db.find(cur);
+if (dirty_end == bs->dirty_db.end())
+{
+// Already flushed
+continue;
+}
+auto repeat_it = sync_to_repeat.find(cur.oid);
+if (repeat_it != sync_to_repeat.end())
+{
+// Someone is already flushing it
+unflushable_type = "locked,";
+unflushable = cur;
+break;
+}
+if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+(bs->journal.dirty_start >= bs->journal.used_start ||
+dirty_end->second.journal_sector < bs->journal.used_start))
+{
+// Object is more recent than possible to flush
+bool found = try_find_older(dirty_end, cur);
+if (!found)
+{
+unflushable_type = "dirty,";
+unflushable = cur;
+break;
+}
+}
+unflushable_type = "ok,";
+unflushable = cur;
+break;
+}
+printf(
+"Flusher: queued=%ld first=%s%lx:%lx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
+flush_queue.size(), unflushable_type, unflushable.oid.inode, unflushable.oid.stripe,
+trim_wanted, dequeuing, trimming, cur_flusher_count, target_flusher_count,
+active_flushers, syncing_flushers
+);
+}
+
+bool journal_flusher_t::try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur)
+{
+bool found = false;
+while (dirty_end != bs->dirty_db.begin())
+{
+dirty_end--;
+if (dirty_end->first.oid != cur.oid)
+{
+break;
+}
+if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+(bs->journal.dirty_start >= bs->journal.used_start ||
+dirty_end->second.journal_sector < bs->journal.used_start)))
+{
+found = true;
+cur.version = dirty_end->first.version;
+break;
+}
+}
+return found;
+}
+
 #define await_sqe(label) \
 resume_##label:\
 sqe = bs->get_sqe();\
@@ -286,30 +355,15 @@ stop_flusher:
 // And it may even block writes if we don't flush the older version
 // (if it's in the beginning of the journal)...
 // So first try to find an older version of the same object to flush.
-bool found = false;
-while (dirty_end != bs->dirty_db.begin())
-{
-dirty_end--;
-if (dirty_end->first.oid != cur.oid)
-{
-break;
-}
-if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
-(bs->journal.dirty_start >= bs->journal.used_start ||
-dirty_end->second.journal_sector < bs->journal.used_start)))
-{
-found = true;
-cur.version = dirty_end->first.version;
-break;
-}
-}
+bool found = flusher->try_find_older(dirty_end, cur);
 if (!found)
 {
 // Try other objects
 flusher->sync_to_repeat.erase(cur.oid);
 int search_left = flusher->flush_queue.size() - 1;
 #ifdef BLOCKSTORE_DEBUG
-printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
+printf("Flusher overran writers (%lx:%lx v%lu, dirty_start=%08lx) - searching for older flushes (%d left)\n",
+cur.oid.inode, cur.oid.stripe, cur.version, bs->journal.dirty_start, search_left);
 #endif
 while (search_left > 0)
 {
@@ -332,7 +386,12 @@ stop_flusher:
 else
 {
 repeat_it = flusher->sync_to_repeat.find(cur.oid);
-if (repeat_it == flusher->sync_to_repeat.end())
+if (repeat_it != flusher->sync_to_repeat.end())
+{
+if (repeat_it->second < cur.version)
+repeat_it->second = cur.version;
+}
+else
 {
 flusher->sync_to_repeat[cur.oid] = 0;
 break;
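The hunk above factors the older-version search out of the flusher coroutine into journal_flusher_t::try_find_older(), which walks dirty_db backwards from a given entry looking for an earlier version of the same object that is still flushable. A self-contained sketch of that backward scan over a sorted map, with a simplified (inode, version) key and a caller-supplied predicate standing in for the journal-position test (hypothetical names, not Vitastor code):

```cpp
#include <cstdint>
#include <map>
#include <utility>

using obj_key = std::pair<uint64_t, uint64_t>; // (object id, version)

// Walk backwards from 'it' (which points at the entry for 'key'), staying on
// the same object id. If an earlier version passes 'flushable', adopt that
// version into 'key' and return true; otherwise return false.
template<typename Pred>
bool find_older(std::map<obj_key, int> & db, std::map<obj_key, int>::iterator & it,
    obj_key & key, Pred flushable)
{
    while (it != db.begin())
    {
        it--;
        if (it->first.first != key.first)
            break; // ran off this object's versions
        if (flushable(it->second))
        {
            key.second = it->first.second; // older flushable version found
            return true;
        }
    }
    return false;
}
```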
@@ -97,6 +97,9 @@ class journal_flusher_t
 std::map<uint64_t, meta_sector_t> meta_sectors;
 std::deque<object_id> flush_queue;
 std::map<object_id, uint64_t> flush_versions;
+
+bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
+
 public:
 journal_flusher_t(blockstore_impl_t *bs);
 ~journal_flusher_t();
@@ -108,4 +111,5 @@ public:
 void enqueue_flush(obj_ver_id oid);
 void unshift_flush(obj_ver_id oid, bool force);
 void remove_flush(object_id oid);
+void dump_diagnostics();
 };
@@ -595,3 +595,9 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
 op->buf = stable;
 FINISH_OP(op);
 }
+
+void blockstore_impl_t::dump_diagnostics()
+{
+journal.dump_diagnostics();
+flusher->dump_diagnostics();
+}
@@ -361,6 +361,9 @@ public:
 // Space usage statistics
 std::map<uint64_t, uint64_t> inode_space_stats;
 
+// Print diagnostics to stdout
+void dump_diagnostics();
+
 inline uint32_t get_block_size() { return block_size; }
 inline uint64_t get_block_count() { return block_count; }
 inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
@@ -218,3 +218,19 @@ uint64_t journal_t::get_trim_pos()
 // Can't trim journal
 return used_start;
 }
+
+void journal_t::dump_diagnostics()
+{
+auto journal_used_it = used_sectors.lower_bound(used_start);
+if (journal_used_it == used_sectors.end())
+{
+// Journal is cleared to its end, restart from the beginning
+journal_used_it = used_sectors.begin();
+}
+printf(
+"Journal: used_start=%08lx next_free=%08lx dirty_start=%08lx trim_to=%08lx trim_to_refs=%ld\n",
+used_start, next_free, dirty_start,
+journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
+journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
+);
+}
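journal_t::dump_diagnostics() above locates the first still-referenced sector at or after used_start and wraps to the lowest entry when the search runs off the end of the map, matching the circular layout of the journal. A generic sketch of that lower_bound-with-wraparound lookup (simplified types, not the actual journal structures):

```cpp
#include <cstdint>
#include <map>

// Return an iterator to the first used offset at or after 'start';
// if there is none, wrap around to the lowest used offset.
std::map<uint64_t, uint64_t>::const_iterator
first_used(const std::map<uint64_t, uint64_t> & used, uint64_t start)
{
    auto it = used.lower_bound(start);
    if (it == used.end())
        it = used.begin(); // circular buffer: restart from the beginning
    return it;             // still end() only if the map is empty
}
```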
@@ -180,6 +180,7 @@ struct journal_t
 ~journal_t();
 bool trim();
 uint64_t get_trim_pos();
+void dump_diagnostics();
 inline bool entry_fits(int size)
 {
 return !(block_size - in_sector_pos < size ||
@@ -478,15 +478,15 @@ resume_2:
 }
 resume_4:
 // Switch object state
-#ifdef BLOCKSTORE_DEBUG
-printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
-#endif
+{
+auto dirty_it = dirty_db.find((obj_ver_id){
+.oid = op->oid,
+.version = op->version,
+});
+assert(dirty_it != dirty_db.end());
+#ifdef BLOCKSTORE_DEBUG
+printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+#endif
 bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
 bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
 if (imm)
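The rewritten write-acknowledgement path re-resolves dirty_it from dirty_db by key (and asserts the entry exists) instead of reusing an iterator obtained earlier; presumably this is the safe pattern because a std::map iterator stays valid only while its element has not been erased, so after a resume point where other code may have touched the map, looking the entry up again by key avoids a stale iterator. A small illustration of the re-lookup pattern, outside of any Vitastor types:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

std::map<uint64_t, std::string> table;

void finish(uint64_t key)
{
    // Re-find by key instead of trusting an iterator saved before a
    // suspension point: erasing that element elsewhere would have
    // invalidated the saved iterator, but a fresh find() is always safe.
    auto it = table.find(key);
    assert(it != table.end());
    it->second += " (done)";
}
```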
src/cli.cpp (new file, 251 lines)
@@ -0,0 +1,251 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
/**
|
||||
* CLI tool
|
||||
* Currently can (a) remove inodes and (b) merge snapshot/clone layers
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "cli.h"
|
||||
#include "epoll_manager.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
#include "base64.h"
|
||||
|
||||
static const char *exe_name = NULL;
|
||||
|
||||
json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
|
||||
{
|
||||
json11::Json::object cfg;
|
||||
json11::Json::array cmd;
|
||||
cfg["progress"] = "1";
|
||||
for (int i = 1; i < narg; i++)
|
||||
{
|
||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
||||
{
|
||||
help();
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
|
||||
}
|
||||
else
|
||||
{
|
||||
cmd.push_back(std::string(args[i]));
|
||||
}
|
||||
}
|
||||
if (!cmd.size())
|
||||
{
|
||||
std::string exe(exe_name);
|
||||
if (exe.substr(exe.size()-11) == "vitastor-rm")
|
||||
{
|
||||
cmd.push_back("rm-data");
|
||||
}
|
||||
}
|
||||
cfg["command"] = cmd;
|
||||
return cfg;
|
||||
}
|
||||
|
||||
void cli_tool_t::help()
|
||||
{
|
||||
printf(
|
||||
"Vitastor command-line tool\n"
|
||||
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n\n"
|
||||
"USAGE:\n"
|
||||
"%s rm-data [OPTIONS] --pool <pool> --inode <inode> [--wait-list]\n"
|
||||
" Remove inode data without changing metadata.\n"
|
||||
" --wait-list means first retrieve objects listings and then remove it.\n"
|
||||
" --wait-list requires more memory, but allows to show correct stats.\n"
|
||||
"\n"
|
||||
"%s merge-data [OPTIONS] <from> <to> [--target <target>]\n"
|
||||
" Merge layer data without changing metadata. Merge <from>..<to> to <target>.\n"
|
||||
" <to> must be a child of <from> and <target> may be one of the layers between\n"
|
||||
" <from> and <to>, including <from> and <to>.\n"
|
||||
"\n"
|
||||
"%s flatten [OPTIONS] <layer>\n"
|
||||
" Flatten a layer, i.e. merge data and detach it from parents\n"
|
||||
"\n"
|
||||
"%s rm [OPTIONS] <from> [<to>] [--writers-stopped 1]\n"
|
||||
" Remove <from> or all layers between <from> and <to> (<to> must be a child of <from>),\n"
|
||||
" rebasing all their children accordingly. One of deleted parents may be renamed to one\n"
|
||||
" of children \"to be rebased\", but only if that child itself is readonly or if\n"
|
||||
" --writers-stopped 1 is specified\n"
|
||||
"\n"
|
||||
"OPTIONS (global):\n"
|
||||
" --etcd_address <etcd_address>\n"
|
||||
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
|
||||
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
|
||||
" --progress 1|0 Report progress (default 1)\n"
|
||||
" --cas 1|0 Use online CAS writes when possible (default auto)\n"
|
||||
,
|
||||
exe_name, exe_name, exe_name, exe_name
|
||||
);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
|
||||
{
|
||||
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t new_cfg = cur_cfg_it->second;
|
||||
std::string cur_name = new_cfg.name;
|
||||
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur)));
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_cfg.mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
|
||||
} }
|
||||
},
|
||||
} },
|
||||
}, ETCD_SLOW_TIMEOUT, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (new_parent)
|
||||
{
|
||||
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
|
||||
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
|
||||
? new_parent_it->second.name : "<unknown>";
|
||||
printf(
|
||||
"Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
|
||||
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
|
||||
new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Parent of layer %s (inode %lu in pool %u) detached\n",
|
||||
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
|
||||
);
|
||||
}
|
||||
waiting--;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
|
||||
{
|
||||
for (auto & ic: cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == name)
|
||||
{
|
||||
return &ic.second;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Layer %s not found\n", name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void cli_tool_t::run(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
if (!cmd.size())
|
||||
{
|
||||
fprintf(stderr, "command is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
else if (cmd[0] == "rm-data")
|
||||
{
|
||||
// Delete inode data
|
||||
action_cb = start_rm(cfg);
|
||||
}
|
||||
else if (cmd[0] == "merge-data")
|
||||
{
|
||||
// Merge layer data without affecting metadata
|
||||
action_cb = start_merge(cfg);
|
||||
}
|
||||
else if (cmd[0] == "flatten")
|
||||
{
|
||||
// Merge layer data without affecting metadata
|
||||
action_cb = start_flatten(cfg);
|
||||
}
|
||||
else if (cmd[0] == "rm")
|
||||
{
|
||||
// Remove multiple snapshots and rebase their children
|
||||
action_cb = start_snap_rm(cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
|
||||
exit(1);
|
||||
}
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
iodepth = 32;
|
||||
parallel_osds = cfg["parallel_osds"].uint64_value();
|
||||
if (!parallel_osds)
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
cli->on_ready([this]()
|
||||
{
|
||||
// Initialize job
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
if (action_cb != NULL)
|
||||
{
|
||||
bool done = action_cb();
|
||||
if (done)
|
||||
{
|
||||
action_cb = NULL;
|
||||
}
|
||||
}
|
||||
ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
consumer.loop();
|
||||
});
|
||||
// Loop until it completes
|
||||
while (action_cb != NULL)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
{
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
exe_name = args[0];
|
||||
cli_tool_t *p = new cli_tool_t();
|
||||
p->run(cli_tool_t::parse_args(narg, args));
|
||||
return 0;
|
||||
}
|
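parse_args() above folds `--option value` pairs into a json11 configuration object, treats `--json` and `--wait-list` as valueless boolean flags, and collects everything else as positional command words (falling back to `rm-data` when the binary is invoked as vitastor-rm). A stripped-down sketch of the same parsing shape using plain standard containers instead of json11 (illustrative only, not the shipped code):

```cpp
#include <cstring>
#include <map>
#include <string>
#include <vector>

struct parsed_args
{
    std::map<std::string, std::string> options;
    std::vector<std::string> command; // positional words, e.g. {"rm-data"}
};

parsed_args parse(int narg, const char *args[])
{
    parsed_args p;
    for (int i = 1; i < narg; i++)
    {
        if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i] + 2;
            // Boolean-style flags take no value; other options consume the next word
            bool boolean = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg - 1;
            p.options[opt] = boolean ? "1" : args[++i];
        }
        else
            p.command.push_back(args[i]);
    }
    return p;
}
```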
src/cli.h (new file, 56 lines)
@@ -0,0 +1,56 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+// Common CLI tool header
+
+#pragma once
+
+#include "json11/json11.hpp"
+#include "object_id.h"
+#include "ringloop.h"
+#include <functional>
+
+struct rm_inode_t;
+struct snap_merger_t;
+struct snap_flattener_t;
+struct snap_remover_t;
+
+class epoll_manager_t;
+class cluster_client_t;
+struct inode_config_t;
+
+class cli_tool_t
+{
+public:
+uint64_t iodepth = 0, parallel_osds = 0;
+bool progress = true;
+bool list_first = false;
+int log_level = 0;
+int mode = 0;
+
+ring_loop_t *ringloop = NULL;
+epoll_manager_t *epmgr = NULL;
+cluster_client_t *cli = NULL;
+
+int waiting = 0;
+ring_consumer_t consumer;
+std::function<bool(void)> action_cb;
+
+void run(json11::Json cfg);
+
+void change_parent(inode_t cur, inode_t new_parent);
+inode_config_t* get_inode_cfg(const std::string & name);
+
+static json11::Json::object parse_args(int narg, const char *args[]);
+static void help();
+
+friend struct rm_inode_t;
+friend struct snap_merger_t;
+friend struct snap_flattener_t;
+friend struct snap_remover_t;
+
+std::function<bool(void)> start_rm(json11::Json);
+std::function<bool(void)> start_merge(json11::Json);
+std::function<bool(void)> start_flatten(json11::Json);
+std::function<bool(void)> start_snap_rm(json11::Json);
+};
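The `start_*` factories declared above all return a `std::function<bool(void)>`: cli_tool_t stores it in `action_cb` and keeps polling it from its ring loop consumer until it reports completion by returning true. A minimal standalone sketch of that poll-until-done contract (the countdown stands in for real asynchronous work; nothing below is Vitastor code):

```cpp
#include <cstdio>
#include <functional>

// A factory returns a resumable action; each call advances the work a bit
// and returns true once the whole job is finished.
std::function<bool(void)> start_countdown(int n)
{
    return [n]() mutable
    {
        printf("%d step(s) left\n", n);
        return --n <= 0;
    };
}

int main()
{
    auto action_cb = start_countdown(3);
    while (!action_cb())
        ; // the real tool re-polls between ringloop->loop()/wait() iterations
    return 0;
}
```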
src/cli_flatten.cpp (new file, 124 lines)
@@ -0,0 +1,124 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
|
||||
// Flatten a layer: merge all parents into a layer and break the connection completely
|
||||
struct snap_flattener_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// target to flatten
|
||||
std::string target_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::string top_parent_name;
|
||||
inode_t target_id = 0;
|
||||
int state = 0;
|
||||
std::function<bool(void)> merger_cb;
|
||||
|
||||
void get_merge_parents()
|
||||
{
|
||||
// Get all parents of target
|
||||
inode_config_t *target_cfg = parent->get_inode_cfg(target_name);
|
||||
target_id = target_cfg->num;
|
||||
std::vector<inode_t> chain_list;
|
||||
inode_config_t *cur = target_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->parent_id != 0 && cur->parent_id != target_cfg->num)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->parent_id != 0)
|
||||
{
|
||||
fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
top_parent_name = cur->name;
|
||||
}
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 5;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
// Get parent layers
|
||||
get_merge_parents();
|
||||
// Start merger
|
||||
merger_cb = parent->start_merge(json11::Json::object {
|
||||
{ "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
|
||||
{ "target", target_name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
// Wait for it
|
||||
resume_1:
|
||||
while (!merger_cb())
|
||||
{
|
||||
state = 1;
|
||||
return;
|
||||
}
|
||||
merger_cb = NULL;
|
||||
// Change parent
|
||||
parent->change_parent(target_id, 0);
|
||||
// Wait for it to complete
|
||||
state = 2;
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
state = 3;
|
||||
resume_3:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto flattener = new snap_flattener_t();
|
||||
flattener->parent = this;
|
||||
flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
if (flattener->target_name == "")
|
||||
{
|
||||
fprintf(stderr, "Layer to flatten argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!flattener->fsync_interval)
|
||||
flattener->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
return [flattener]()
|
||||
{
|
||||
flattener->loop();
|
||||
if (flattener->is_done())
|
||||
{
|
||||
delete flattener;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
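snap_flattener_t::loop() above, like the other cli_*.cpp state machines, is a resumable coroutine built by hand: an integer `state` plus `goto resume_N` labels let the function be re-entered from the event loop and jump straight back to the point where it previously had to wait. A compact standalone illustration of that idiom (the pending counter fakes asynchronous completions; not the shipped code):

```cpp
#include <cstdio>

struct resumable_t
{
    int state = 0;
    int pending = 2; // pretend we are waiting for 2 asynchronous replies

    bool is_done() { return state == 2; }

    void loop()
    {
        if (state == 1)
            goto resume_1;
        printf("start: issue requests\n");
        state = 1;
    resume_1:
        if (pending > 0)
        {
            pending--; // in real code a completion callback would do this
            return;    // yield back to the event loop
        }
        printf("all replies received, finishing\n");
        state = 2;
    }
};

int main()
{
    resumable_t r;
    while (!r.is_done())
        r.loop(); // the CLI re-enters loop() from its ring loop consumer
    return 0;
}
```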
src/cli_merge.cpp (new file, 583 lines)
@@ -0,0 +1,583 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "cpp-btree/safe_btree_set.h"
|
||||
|
||||
struct snap_rw_op_t
|
||||
{
|
||||
uint64_t offset = 0;
|
||||
void *buf = NULL;
|
||||
cluster_op_t op;
|
||||
int todo = 0;
|
||||
uint32_t start = 0, end = 0;
|
||||
};
|
||||
|
||||
// Layer merge is the base for multiple operations:
|
||||
// 1) Delete snapshot "up" = merge child layer into the parent layer, remove the child
|
||||
// and rename the parent to the child
|
||||
// 2) Delete snapshot "down" = merge parent layer into the child layer and remove the parent
|
||||
// 3) Flatten image = merge parent layers into the child layer and break the connection
|
||||
struct snap_merger_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// -- CONFIGURATION --
|
||||
// merge from..to into target (target may be one of from..to)
|
||||
std::string from_name, to_name, target_name;
|
||||
// inode=>rank (bigger rank means child layers)
|
||||
std::map<inode_t,int> sources;
|
||||
// delete merged source inode data during merge
|
||||
bool delete_source = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// don't necessarily delete source data, but perform checks as if we were to do it
|
||||
bool check_delete_source = false;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
// -- STATE --
|
||||
inode_t target;
|
||||
int target_rank;
|
||||
bool inside_continue = false;
|
||||
int state = 0;
|
||||
int lists_todo = 0;
|
||||
uint64_t target_block_size = 0;
|
||||
btree::safe_btree_set<uint64_t> merge_offsets;
|
||||
btree::safe_btree_set<uint64_t>::iterator oit;
|
||||
std::map<inode_t, std::vector<uint64_t>> layer_lists;
|
||||
std::map<inode_t, uint64_t> layer_block_size;
|
||||
std::map<inode_t, uint64_t> layer_list_pos;
|
||||
int in_flight = 0;
|
||||
uint64_t last_fsync_offset = 0;
|
||||
uint64_t last_written_offset = 0;
|
||||
int deleted_unsynced = 0;
|
||||
uint64_t processed = 0, to_process = 0;
|
||||
|
||||
void start_merge()
|
||||
{
|
||||
check_delete_source = delete_source || check_delete_source;
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
|
||||
if (to_cfg->num == from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Only one layer specified, nothing to merge\n");
|
||||
exit(1);
|
||||
}
|
||||
// Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
|
||||
std::vector<inode_t> chain_list;
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
layer_block_size[cur->num] = get_block_size(cur->num);
|
||||
while (cur->parent_id != from_cfg->num &&
|
||||
cur->parent_id != to_cfg->num &&
|
||||
cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
layer_block_size[cur->num] = get_block_size(cur->num);
|
||||
}
|
||||
if (cur->parent_id != from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
chain_list.push_back(from_cfg->num);
|
||||
layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
sources[item] = i--;
|
||||
}
|
||||
if (sources.find(target_cfg->num) == sources.end())
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
target = target_cfg->num;
|
||||
target_rank = sources.at(target);
|
||||
int to_rank = sources.at(to_cfg->num);
|
||||
bool to_has_children = false;
|
||||
// Check that there are no other inodes dependent on altered layers
|
||||
//
|
||||
// 1) everything between <target> and <to> except <to> is not allowed
|
||||
// to have children other than <to> if <to> is a child of <target>:
|
||||
//
|
||||
// <target> - <layer 3> - <to>
|
||||
// \- <layer 4> <--------X--------- NOT ALLOWED
|
||||
//
|
||||
// 2) everything between <from> and <target>, except <target>, is not allowed
|
||||
// to have children other than <target> if sources are to be deleted after merging:
|
||||
//
|
||||
// <from> - <layer 1> - <target> - <to>
|
||||
// \- <layer 2> <---------X-------- NOT ALLOWED
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
auto it = sources.find(ic.second.num);
|
||||
if (it == sources.end() && ic.second.parent_id != 0)
|
||||
{
|
||||
it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end())
|
||||
{
|
||||
int parent_rank = it->second;
|
||||
if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Layers at or above %s, but below %s are not allowed"
|
||||
" to have other children, but %s is a child of %s\n",
|
||||
(check_delete_source ? from_name.c_str() : target_name.c_str()),
|
||||
to_name.c_str(), ic.second.name.c_str(),
|
||||
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
if (parent_rank >= to_rank)
|
||||
{
|
||||
to_has_children = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((target_rank < to_rank || to_has_children) && use_cas == 1)
|
||||
{
|
||||
// <to> has children itself, no need for CAS
|
||||
use_cas = 0;
|
||||
}
|
||||
sources.erase(target);
|
||||
printf(
|
||||
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
|
||||
sources.size(), target_cfg->name.c_str(),
|
||||
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
||||
);
|
||||
target_block_size = get_block_size(target);
|
||||
}
|
||||
|
||||
uint64_t get_block_size(inode_t inode)
|
||||
{
|
||||
auto & pool_cfg = parent->cli->st_cli.pool_config.at(INODE_POOL(inode));
|
||||
uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
return parent->cli->get_bs_block_size() * pg_data_size;
|
||||
}
|
||||
|
||||
void continue_merge_reent()
|
||||
{
|
||||
if (!inside_continue)
|
||||
{
|
||||
inside_continue = true;
|
||||
continue_merge();
|
||||
inside_continue = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 6;
|
||||
}
|
||||
|
||||
void continue_merge()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
// Get parents and so on
|
||||
start_merge();
|
||||
// First list lower layers
|
||||
list_layers(true);
|
||||
state = 1;
|
||||
resume_1:
|
||||
while (lists_todo > 0)
|
||||
{
|
||||
// Wait for lists
|
||||
return;
|
||||
}
|
||||
if (merge_offsets.size() > 0)
|
||||
{
|
||||
state = 2;
|
||||
oit = merge_offsets.begin();
|
||||
processed = 0;
|
||||
to_process = merge_offsets.size();
|
||||
resume_2:
|
||||
// Then remove blocks already filled in target by issuing zero-length reads and checking bitmaps
|
||||
while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
|
||||
{
|
||||
in_flight++;
|
||||
check_if_full(*oit);
|
||||
oit++;
|
||||
processed++;
|
||||
if (parent->progress && !(processed % 128))
|
||||
{
|
||||
printf("\rFiltering target blocks: %lu/%lu", processed, to_process);
|
||||
}
|
||||
}
|
||||
if (in_flight > 0 || oit != merge_offsets.end())
|
||||
{
|
||||
// Wait until reads finish
|
||||
return;
|
||||
}
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("\r%lu full blocks of target filtered out\n", to_process-merge_offsets.size());
|
||||
}
|
||||
}
|
||||
state = 3;
|
||||
resume_3:
|
||||
// Then list upper layers
|
||||
list_layers(false);
|
||||
state = 4;
|
||||
resume_4:
|
||||
while (lists_todo > 0)
|
||||
{
|
||||
// Wait for lists
|
||||
return;
|
||||
}
|
||||
state = 5;
|
||||
processed = 0;
|
||||
to_process = merge_offsets.size();
|
||||
oit = merge_offsets.begin();
|
||||
resume_5:
|
||||
// Now read, overwrite and optionally delete offsets one by one
|
||||
while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
|
||||
{
|
||||
in_flight++;
|
||||
read_and_write(*oit);
|
||||
oit++;
|
||||
processed++;
|
||||
if (parent->progress && !(processed % 128))
|
||||
{
|
||||
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
|
||||
}
|
||||
}
|
||||
if (in_flight > 0 || oit != merge_offsets.end())
|
||||
{
|
||||
// Wait until overwrites finish
|
||||
return;
|
||||
}
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
|
||||
}
|
||||
// Done
|
||||
printf("Done, layers from %s to %s merged into %s\n", from_name.c_str(), to_name.c_str(), target_name.c_str());
|
||||
state = 6;
|
||||
resume_6:
|
||||
return;
|
||||
}
|
||||
|
||||
void list_layers(bool lower)
|
||||
{
|
||||
for (auto & sp: sources)
|
||||
{
|
||||
inode_t src = sp.first;
|
||||
if (lower ? (sp.second < target_rank) : (sp.second > target_rank))
|
||||
{
|
||||
lists_todo++;
|
||||
inode_list_t* lst = parent->cli->list_inode_start(src, [this, src](
|
||||
inode_list_t *lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
uint64_t layer_block = layer_block_size.at(src);
|
||||
for (object_id obj: objects)
|
||||
{
|
||||
merge_offsets.insert(obj.stripe - obj.stripe % target_block_size);
|
||||
for (int i = target_block_size; i < layer_block; i += target_block_size)
|
||||
{
|
||||
merge_offsets.insert(obj.stripe - obj.stripe % target_block_size + i);
|
||||
}
|
||||
}
|
||||
if (delete_source)
|
||||
{
|
||||
// Also store individual lists
|
||||
auto & layer_list = layer_lists[src];
|
||||
int pos = layer_list.size();
|
||||
layer_list.resize(pos + objects.size());
|
||||
for (object_id obj: objects)
|
||||
{
|
||||
layer_list[pos++] = obj.stripe;
|
||||
}
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
auto & name = parent->cli->st_cli.inode_config.at(src).name;
|
||||
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||
if (delete_source)
|
||||
{
|
||||
// Sort the inode listing
|
||||
std::sort(layer_lists[src].begin(), layer_lists[src].end());
|
||||
}
|
||||
lists_todo--;
|
||||
continue_merge_reent();
|
||||
}
|
||||
else
|
||||
{
|
||||
parent->cli->list_inode_next(lst, 1);
|
||||
}
|
||||
});
|
||||
parent->cli->list_inode_next(lst, parent->parallel_osds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if <offset> is fully written in <target> and remove it from merge_offsets if so
|
||||
void check_if_full(uint64_t offset)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->opcode = OSD_OP_READ_BITMAP;
|
||||
op->inode = target;
|
||||
op->offset = offset;
|
||||
op->len = 0;
|
||||
op->callback = [this](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval < 0)
|
||||
{
|
||||
fprintf(stderr, "error reading target bitmap at offset %lx: %s\n", op->offset, strerror(-op->retval));
|
||||
}
|
||||
else
|
||||
{
|
||||
uint64_t bitmap_bytes = target_block_size/parent->cli->get_bs_bitmap_granularity()/8;
|
||||
int i;
|
||||
for (i = 0; i < bitmap_bytes; i++)
|
||||
{
|
||||
if (((uint8_t*)op->bitmap_buf)[i] != 0xff)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == bitmap_bytes)
|
||||
{
|
||||
// full
|
||||
merge_offsets.erase(op->offset);
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
in_flight--;
|
||||
continue_merge_reent();
|
||||
};
|
||||
parent->cli->execute(op);
|
||||
}
|
||||
|
||||
// Read <offset> from <to>, write it to <target> and optionally delete it
|
||||
// from all layers except <target> after fsync'ing
|
||||
void read_and_write(uint64_t offset)
|
||||
{
|
||||
snap_rw_op_t *rwo = new snap_rw_op_t;
|
||||
// Initialize counter to 1 to later allow write_subop() to return immediately
|
||||
// (even though it shouldn't really do that)
|
||||
rwo->todo = 1;
|
||||
rwo->buf = malloc(target_block_size);
|
||||
rwo->offset = offset;
|
||||
rwo_read(rwo);
|
||||
}
|
||||
|
||||
void rwo_read(snap_rw_op_t *rwo)
|
||||
{
|
||||
cluster_op_t *op = &rwo->op;
|
||||
op->opcode = OSD_OP_READ;
|
||||
op->inode = target;
|
||||
op->offset = rwo->offset;
|
||||
op->len = target_block_size;
|
||||
op->iov.push_back(rwo->buf, target_block_size);
|
||||
op->callback = [this, rwo](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval != op->len)
|
||||
{
|
||||
fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
|
||||
exit(1);
|
||||
}
|
||||
next_write(rwo);
|
||||
};
|
||||
parent->cli->execute(op);
|
||||
}
|
||||
|
||||
void next_write(snap_rw_op_t *rwo)
|
||||
{
|
||||
// Write each non-empty range using an individual operation
|
||||
// FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
|
||||
uint32_t gran = parent->cli->get_bs_bitmap_granularity();
|
||||
uint64_t bitmap_size = target_block_size / gran;
|
||||
while (rwo->end < bitmap_size)
|
||||
{
|
||||
auto bit = ((*(uint8_t*)(rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
|
||||
if (!bit)
|
||||
{
|
||||
if (rwo->end > rwo->start)
|
||||
{
|
||||
// write start->end
|
||||
rwo->todo++;
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas ? 1+rwo->op.version : 0);
|
||||
rwo->start = rwo->end;
|
||||
if (use_cas)
|
||||
{
|
||||
// Submit one by one if using CAS writes
|
||||
return;
|
||||
}
|
||||
}
|
||||
rwo->start = rwo->end = rwo->end+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
rwo->end++;
|
||||
}
|
||||
}
|
||||
if (rwo->end > rwo->start)
|
||||
{
|
||||
// write start->end
|
||||
rwo->todo++;
|
||||
write_subop(rwo, rwo->start*gran, rwo->end*gran, use_cas ? 1+rwo->op.version : 0);
|
||||
rwo->start = rwo->end;
|
||||
if (use_cas)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
rwo->todo--;
|
||||
// Just in case, if everything is done
|
||||
autofree_op(rwo);
|
||||
}
|
||||
|
||||
void write_subop(snap_rw_op_t *rwo, uint32_t start, uint32_t end, uint64_t version)
|
||||
{
|
||||
cluster_op_t *subop = new cluster_op_t;
|
||||
subop->opcode = OSD_OP_WRITE;
|
||||
subop->inode = target;
|
||||
subop->offset = rwo->offset+start;
|
||||
subop->len = end-start;
|
||||
subop->version = version;
|
||||
subop->flags = OSD_OP_IGNORE_READONLY;
|
||||
subop->iov.push_back(rwo->buf+start, end-start);
|
||||
subop->callback = [this, rwo](cluster_op_t *subop)
|
||||
{
|
||||
rwo->todo--;
|
||||
if (subop->retval != subop->len)
|
||||
{
|
||||
if (use_cas && subop->retval == -EINTR)
|
||||
{
|
||||
// CAS failure - reread and repeat optimistically
|
||||
rwo->start = subop->offset - rwo->offset;
|
||||
rwo_read(rwo);
|
||||
delete subop;
|
||||
return;
|
||||
}
|
||||
fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
|
||||
exit(1);
|
||||
}
|
||||
// Increment CAS version
|
||||
rwo->op.version++;
|
||||
if (use_cas)
|
||||
next_write(rwo);
|
||||
else
|
||||
autofree_op(rwo);
|
||||
delete subop;
|
||||
};
|
||||
parent->cli->execute(subop);
|
||||
}
|
||||
|
||||
void delete_offset(inode_t inode_num, uint64_t offset)
|
||||
{
|
||||
cluster_op_t *subop = new cluster_op_t;
|
||||
subop->opcode = OSD_OP_DELETE;
|
||||
subop->inode = inode_num;
|
||||
subop->offset = offset;
|
||||
subop->len = 0;
|
||||
subop->flags = OSD_OP_IGNORE_READONLY;
|
||||
subop->callback = [this](cluster_op_t *subop)
|
||||
{
|
||||
if (subop->retval != 0)
|
||||
{
|
||||
fprintf(stderr, "error deleting from layer 0x%lx at offset %lx: %s", subop->inode, subop->offset, strerror(-subop->retval));
|
||||
}
|
||||
delete subop;
|
||||
};
|
||||
parent->cli->execute(subop);
|
||||
}
|
||||
|
||||
void autofree_op(snap_rw_op_t *rwo)
|
||||
{
|
||||
if (!rwo->todo)
|
||||
{
|
||||
if (last_written_offset < rwo->op.offset+target_block_size)
|
||||
{
|
||||
last_written_offset = rwo->op.offset+target_block_size;
|
||||
}
|
||||
if (delete_source)
|
||||
{
|
||||
deleted_unsynced++;
|
||||
if (deleted_unsynced >= fsync_interval)
|
||||
{
|
||||
uint64_t from = last_fsync_offset, to = last_written_offset;
|
||||
cluster_op_t *subop = new cluster_op_t;
|
||||
subop->opcode = OSD_OP_SYNC;
|
||||
subop->callback = [this, from, to](cluster_op_t *subop)
|
||||
{
|
||||
delete subop;
|
||||
// We can now delete source data between <from> and <to>
|
||||
// But to do this we have to keep all object lists in memory :-(
|
||||
for (auto & lp: layer_list_pos)
|
||||
{
|
||||
auto & layer_list = layer_lists.at(lp.first);
|
||||
uint64_t layer_block = layer_block_size.at(lp.first);
|
||||
int cur_pos = lp.second;
|
||||
while (cur_pos < layer_list.size() && layer_list[cur_pos]+layer_block < to)
|
||||
{
|
||||
delete_offset(lp.first, layer_list[cur_pos]);
|
||||
cur_pos++;
|
||||
}
|
||||
lp.second = cur_pos;
|
||||
}
|
||||
};
|
||||
parent->cli->execute(subop);
|
||||
}
|
||||
}
|
||||
free(rwo->buf);
|
||||
delete rwo;
|
||||
in_flight--;
|
||||
continue_merge_reent();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto merger = new snap_merger_t();
|
||||
merger->parent = this;
|
||||
merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
merger->target_name = cfg["target"].string_value();
|
||||
if (merger->from_name == "" || merger->to_name == "")
|
||||
{
|
||||
fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
merger->delete_source = cfg["delete-source"].string_value() != "";
|
||||
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!merger->fsync_interval)
|
||||
merger->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
return [merger]()
|
||||
{
|
||||
merger->continue_merge_reent();
|
||||
if (merger->is_done())
|
||||
{
|
||||
delete merger;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
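check_if_full() above issues a zero-length OSD_OP_READ_BITMAP and drops a target block from merge_offsets when its allocation bitmap is already all ones, i.e. every bitmap byte equals 0xff. The test itself is a plain scan over the buffer; a self-contained sketch (simplified, not the shipped code):

```cpp
#include <cstddef>
#include <cstdint>

// A block is "full" when every granularity unit is marked as written,
// i.e. every byte of its bitmap equals 0xff.
bool block_is_full(const uint8_t *bitmap, size_t bitmap_bytes)
{
    for (size_t i = 0; i < bitmap_bytes; i++)
    {
        if (bitmap[i] != 0xff)
            return false;
    }
    return true;
}

// Example: a 128 KiB block with 4 KiB bitmap granularity has
// 128*1024 / 4096 = 32 bits, i.e. 4 bitmap bytes to check.
```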
src/cli_rm.cpp (new file, 195 lines)
@@ -0,0 +1,195 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
|
||||
void start_delete()
|
||||
{
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
.obj_prev_done = 0,
|
||||
});
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
||||
exit(1);
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->obj_pos++;
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
||||
state = 2;
|
||||
}
|
||||
}
|
||||
|
||||
bool loop()
|
||||
{
|
||||
if (state == 0)
|
||||
{
|
||||
start_delete();
|
||||
state = 1;
|
||||
}
|
||||
else if (state == 1)
|
||||
{
|
||||
continue_delete();
|
||||
}
|
||||
else if (state == 2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
{
|
||||
auto remover = new rm_inode_t();
|
||||
remover->parent = this;
|
||||
remover->inode = cfg["inode"].uint64_value();
|
||||
remover->pool_id = cfg["pool"].uint64_value();
|
||||
if (remover->pool_id)
|
||||
{
|
||||
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
if (!remover->pool_id)
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
return [remover]()
|
||||
{
|
||||
if (remover->loop())
|
||||
{
|
||||
delete remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
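The bit arithmetic in start_rm() above packs a 16-bit pool id into the top bits of a 64-bit inode number. A small self-contained example (not from the patch, numbers chosen for illustration) of what that expression produces:

#include <cstdint>
#include <cassert>

int main()
{
    const int POOL_ID_BITS = 16;               // same constant as in the OSD headers
    uint64_t pool_id = 1, inode = 1;
    // Keep the low 48 bits of the inode and put the pool id into the top 16 bits,
    // exactly like the expression in cli_tool_t::start_rm()
    uint64_t full = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool_id << (64-POOL_ID_BITS));
    assert(full == 0x0001000000000001);        // INODE_POOL(full) == 1, INODE_NO_POOL(full) == 1
    return 0;
}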
565 src/cli_snap_rm.cpp Normal file
@@ -0,0 +1,565 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating though them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// <parent> - <from> - <layer 2> - <to> - <child 1>
|
||||
// \ \ \- <child 2>
|
||||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int current_child = 0;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 9;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 9)
|
||||
goto resume_9;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
resume_2:
|
||||
while (!cb())
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
resume_4:
|
||||
while (!cb())
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
resume_5:
|
||||
while (!cb())
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb())
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 9;
|
||||
resume_9:
|
||||
// Done
|
||||
return;
|
||||
}
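loop() above is a resumable state machine: every time the surrounding event loop calls it, it jumps straight to the step it was suspended at and returns whenever it has to wait for etcd or for a sub-command. A stripped-down illustration of the same pattern (not part of the patch):

struct two_step_job
{
    int state = 0;
    bool waiting = false;          // cleared by some completion callback
    void start_async_step() { waiting = true; }
    void loop()
    {
        if (state == 1)
            goto resume_1;
        start_async_step();        // kick off the asynchronous work
        state = 1;
    resume_1:
        if (waiting)
            return;                // analogous to "if (parent->waiting > 0) return;"
        state = 2;                 // done
    }
};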
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
// Check that to_cfg is actually a child of from_cfg
|
||||
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->num != from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
sources[item] = i--;
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (!ic.second.parent_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void read_stats()
|
||||
{
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
for (auto cp: sources)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
exit(1);
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
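read_stats() converts the raw space usage reported in etcd into logical per-inode usage. As a worked example (illustrative numbers, not from the patch): with raw_used = 3 GB, a 3-replica pool (pg_size = 3) yields 3 / 3 = 1 GB of inode data, while an EC 2+1 pool (pg_size = 3, parity_chunks = 1) yields 3 / 3 * (3 - 1) = 2 GB, because parity chunks occupy raw space but do not hold extra data.

// logical_used = raw_used / pg_size                              (replicated pools)
// logical_used = raw_used / pg_size * (pg_size - parity_chunks)  (EC/XOR pools)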
|
||||
|
||||
void choose_inverse_candidate()
|
||||
{
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::array cmp = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
};
|
||||
json11::Json::array txn = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||
{ "value", base64_encode(json11::Json({
|
||||
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||
}).dump()) },
|
||||
} },
|
||||
},
|
||||
};
|
||||
// Reparent children of inverse_child
|
||||
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cp.second.parent_id == child_cfg->num)
|
||||
{
|
||||
auto cp_cfg = cp.second;
|
||||
cp_cfg.parent_id = inverse_parent;
|
||||
auto cp_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||
);
|
||||
cmp.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cp_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cp.second.mod_revision+1 },
|
||||
});
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cp_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Parent (%s), child (%s), or one of its children"
|
||||
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void delete_inode_config(inode_t cur)
|
||||
{
|
||||
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *cur_cfg = &cur_cfg_it->second;
|
||||
std::string cur_name = cur_cfg->name;
|
||||
std::string cur_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur))
|
||||
);
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cur_cfg->mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
} },
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
}, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s deleted\n", cur_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void start_merge_child(inode_t child_inode, inode_t target_inode)
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_merge(json11::Json::object {
|
||||
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
|
||||
{ "target", target_it->second.name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
|
||||
void start_delete_source(inode_t inode)
|
||||
{
|
||||
auto source = parent->cli->st_cli.inode_config.find(inode);
|
||||
if (source == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_rm(json11::Json::object {
|
||||
{ "inode", inode },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto snap_remover = new snap_remover_t();
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
if (snap_remover->from_name == "")
|
||||
{
|
||||
fprintf(stderr, "Layer to remove argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (snap_remover->to_name == "")
|
||||
{
|
||||
snap_remover->to_name = snap_remover->from_name;
|
||||
}
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover]()
|
||||
{
|
||||
snap_remover->loop();
|
||||
if (snap_remover->is_done())
|
||||
{
|
||||
delete snap_remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -12,7 +12,7 @@
|
||||
#define CACHE_DIRTY 1
|
||||
#define CACHE_FLUSHING 2
|
||||
#define CACHE_REPEATING 3
|
||||
#define OP_FLUSH_BUFFER 2
|
||||
#define OP_FLUSH_BUFFER 0x02
|
||||
|
||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||
{
|
||||
@@ -31,6 +31,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
{
|
||||
// peer_osd just connected
|
||||
continue_ops();
|
||||
continue_lists();
|
||||
}
|
||||
else if (dirty_buffers.size())
|
||||
{
|
||||
@@ -139,7 +140,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
if (!op->prev_wait && pgs_loaded)
|
||||
continue_sync(op);
|
||||
}
|
||||
else
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
{
|
||||
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||
{
|
||||
@@ -147,7 +148,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait++;
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ)
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning
|
||||
break;
|
||||
@@ -167,7 +168,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC ||
|
||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||
next->opcode == OSD_OP_READ && (flags & OP_FLUSH_BUFFER))
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
if (!next->prev_wait)
|
||||
@@ -357,7 +358,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
// And now they have to be resliced!
|
||||
for (auto op = op_queue_head; op; op = op->next)
|
||||
{
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
@@ -417,7 +418,8 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
||||
*/
|
||||
void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE)
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
@@ -557,7 +559,7 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
|
||||
{
|
||||
wr->state = CACHE_REPEATING;
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->flags = OP_FLUSH_BUFFER;
|
||||
op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
|
||||
op->opcode = OSD_OP_WRITE;
|
||||
op->cur_inode = op->inode = oid.inode;
|
||||
op->offset = oid.stripe;
|
||||
@@ -594,7 +596,8 @@ int cluster_client_t::continue_rw(cluster_op_t *op)
|
||||
else if (op->state == 3)
|
||||
goto resume_3;
|
||||
resume_0:
|
||||
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
||||
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
||||
op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
erase_op(op);
|
||||
@@ -615,16 +618,19 @@ resume_0:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (op->opcode == OSD_OP_WRITE)
|
||||
if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
|
||||
{
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
||||
if (!(op->flags & OSD_OP_IGNORE_READONLY))
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
erase_op(op);
|
||||
return 1;
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||
if (op->opcode == OSD_OP_WRITE && !immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
copy_write(op, dirty_buffers);
|
||||
}
|
||||
@@ -633,7 +639,7 @@ resume_1:
|
||||
// Slice the operation into parts
|
||||
slice_rw(op);
|
||||
op->needs_reslice = false;
|
||||
if (op->opcode == OSD_OP_WRITE && op->version && op->parts.size() > 1)
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && op->version && op->parts.size() > 1)
|
||||
{
|
||||
// Atomic writes to multiple stripes are unsupported
|
||||
op->retval = -EINVAL;
|
||||
@@ -793,13 +799,13 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
uint64_t pg_block_size = bs_block_size * pg_data_size;
|
||||
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
|
||||
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
|
||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
op->retval = 0;
|
||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||
if (op->opcode == OSD_OP_READ)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Allocate memory for the bitmap
|
||||
unsigned object_bitmap_size = ((op->len / bs_bitmap_granularity + 7) / 8);
|
||||
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / bs_bitmap_granularity + 7) / 8);
|
||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||
unsigned bitmap_mem = object_bitmap_size + (bs_bitmap_size * pg_data_size) * op->parts.size();
|
||||
if (op->bitmap_buf_size < bitmap_mem)
|
||||
@@ -863,13 +869,13 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
if (end == begin)
|
||||
op->done_count++;
|
||||
}
|
||||
else
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||
}
|
||||
op->parts[i].parent = op;
|
||||
op->parts[i].offset = begin;
|
||||
op->parts[i].len = (uint32_t)(end - begin);
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].pg_num = pg_num;
|
||||
op->parts[i].osd_num = 0;
|
||||
op->parts[i].flags = 0;
|
||||
@@ -883,7 +889,7 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
uint64_t pg_block_size = bs_block_size * pg_data_size;
|
||||
uint64_t first_stripe = (offset / pg_block_size) * pg_block_size;
|
||||
uint64_t last_stripe = ((offset + len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
|
||||
uint64_t last_stripe = len > 0 ? ((offset + len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
|
||||
{
|
||||
pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
|
||||
@@ -916,9 +922,12 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||
);
|
||||
uint64_t meta_rev = 0;
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end())
|
||||
meta_rev = ino_it->second.mod_revision;
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end())
|
||||
meta_rev = ino_it->second.mod_revision;
|
||||
}
|
||||
part->op = (osd_op_t){
|
||||
.op_type = OSD_OP_OUT,
|
||||
.peer_fd = peer_fd,
|
||||
@@ -926,16 +935,16 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = op_id++,
|
||||
.opcode = op->opcode,
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
},
|
||||
.inode = op->cur_inode,
|
||||
.offset = part->offset,
|
||||
.len = part->len,
|
||||
.meta_revision = meta_rev,
|
||||
.version = op->opcode == OSD_OP_WRITE ? op->version : 0,
|
||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||
} },
|
||||
.bitmap = op->opcode == OSD_OP_WRITE ? NULL : op->part_bitmaps + pg_bitmap_size*i,
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_WRITE ? 0 : pg_bitmap_size),
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
||||
.callback = [this, part](osd_op_t *op_part)
|
||||
{
|
||||
handle_op_part(part);
|
||||
@@ -1117,7 +1126,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
op->done_count++;
|
||||
if (op->opcode == OSD_OP_READ)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
@@ -1141,7 +1150,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
);
|
||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / bs_bitmap_granularity;
|
||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / bs_bitmap_granularity;
|
||||
uint32_t part_len = part->op.req.rw.len / bs_bitmap_granularity;
|
||||
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / bs_bitmap_granularity;
|
||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||
{
|
||||
// Copy bytes
|
||||
@@ -1161,3 +1170,8 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
part_len--;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t cluster_client_t::next_op_id()
|
||||
{
|
||||
return op_id++;
|
||||
}
|
||||
|
@@ -10,6 +10,11 @@
|
||||
#define MAX_BLOCK_SIZE 128*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
|
||||
#define INODE_LIST_DONE 1
|
||||
#define INODE_LIST_HAS_UNSTABLE 2
|
||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||
|
||||
#define OSD_OP_IGNORE_READONLY 0x08
|
||||
|
||||
struct cluster_op_t;
|
||||
|
||||
@@ -27,19 +32,22 @@ struct cluster_op_part_t
|
||||
|
||||
struct cluster_op_t
|
||||
{
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
||||
uint64_t inode;
|
||||
uint64_t offset;
|
||||
uint64_t len;
|
||||
// for reads and writes within a single object (stripe),
|
||||
// reads can return current version and writes can use "CAS" semantics
|
||||
uint64_t version = 0;
|
||||
// now only OSD_OP_IGNORE_READONLY is supported
|
||||
uint64_t flags = 0;
|
||||
int retval;
|
||||
osd_op_buf_list_t iov;
|
||||
// READ and READ_BITMAP return the bitmap here
|
||||
void *bitmap_buf = NULL;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
~cluster_op_t();
|
||||
protected:
|
||||
uint64_t flags = 0;
|
||||
int state = 0;
|
||||
uint64_t cur_inode; // for snapshot reads
|
||||
void *buf = NULL;
|
||||
@@ -48,7 +56,7 @@ protected:
|
||||
bool up_wait = false;
|
||||
int inflight_count = 0, done_count = 0;
|
||||
std::vector<cluster_op_part_t> parts;
|
||||
void *bitmap_buf = NULL, *part_bitmaps = NULL;
|
||||
void *part_bitmaps = NULL;
|
||||
unsigned bitmap_buf_size = 0;
|
||||
cluster_op_t *prev = NULL, *next = NULL;
|
||||
int prev_wait = 0;
|
||||
@@ -62,6 +70,9 @@ struct cluster_buffer_t
|
||||
int state;
|
||||
};
|
||||
|
||||
struct inode_list_t;
|
||||
struct inode_list_osd_t;
|
||||
|
||||
// FIXME: Split into public and private interfaces
|
||||
class cluster_client_t
|
||||
{
|
||||
@@ -93,6 +104,7 @@ class cluster_client_t
|
||||
bool pgs_loaded = false;
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
std::vector<inode_list_t*> lists;
|
||||
int continuing_ops = 0;
|
||||
|
||||
public:
|
||||
@@ -108,6 +120,14 @@ public:
|
||||
|
||||
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
|
||||
void continue_ops(bool up_retry = false);
|
||||
inode_list_t *list_inode_start(inode_t inode,
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
|
||||
int list_pg_count(inode_list_t *lst);
|
||||
void list_inode_next(inode_list_t *lst, int next_pgs);
|
||||
inline uint32_t get_bs_bitmap_granularity() { return bs_bitmap_granularity; }
|
||||
inline uint64_t get_bs_block_size() { return bs_block_size; }
|
||||
uint64_t next_op_id();
|
||||
|
||||
protected:
|
||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||
void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
|
||||
@@ -125,4 +145,7 @@ protected:
|
||||
void erase_op(cluster_op_t *op);
|
||||
void calc_wait(cluster_op_t *op);
|
||||
void inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc);
|
||||
void continue_lists();
|
||||
void continue_listing(inode_list_t *lst);
|
||||
void send_list(inode_list_osd_t *cur_list);
|
||||
};
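The three listing calls declared above are meant to be used together, as cli_rm.cpp earlier in this changeset does: start a listing, ask for a limited number of PGs to be listed in parallel, and request more from the per-PG callback until INODE_LIST_DONE is reported. A condensed sketch (not part of the patch):

void list_inode_objects(cluster_client_t *cli, inode_t inode)
{
    inode_list_t *lst = cli->list_inode_start(inode,
        [cli](inode_list_t *lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
    {
        // Called once per listed PG with the object set of that PG
        // ... process <objects> here ...
        if (!(status & INODE_LIST_DONE))
        {
            cli->list_inode_next(lst, 1);   // keep one more PG listing in flight
        }
    });
    if (lst)
    {
        int parallel = cli->list_pg_count(lst) < 4 ? cli->list_pg_count(lst) : 4;
        cli->list_inode_next(lst, parallel);
    }
}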
|
||||
|
285 src/cluster_client_list.cpp Normal file
@@ -0,0 +1,285 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <algorithm>
|
||||
#include "pg_states.h"
|
||||
#include "cluster_client.h"
|
||||
|
||||
struct inode_list_t;
|
||||
|
||||
struct inode_list_pg_t;
|
||||
|
||||
struct inode_list_osd_t
|
||||
{
|
||||
inode_list_pg_t *pg = NULL;
|
||||
osd_num_t osd_num = 0;
|
||||
bool sent = false;
|
||||
};
|
||||
|
||||
struct inode_list_pg_t
|
||||
{
|
||||
inode_list_t *lst = NULL;
|
||||
int pos = 0;
|
||||
pg_num_t pg_num;
|
||||
osd_num_t cur_primary;
|
||||
bool has_unstable = false;
|
||||
int sent = 0;
|
||||
int done = 0;
|
||||
std::vector<inode_list_osd_t> list_osds;
|
||||
std::set<object_id> objects;
|
||||
};
|
||||
|
||||
struct inode_list_t
|
||||
{
|
||||
cluster_client_t *cli = NULL;
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
int done_pgs = 0;
|
||||
int want = 0;
|
||||
std::vector<inode_list_pg_t*> pgs;
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback;
|
||||
};
|
||||
|
||||
inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
|
||||
{
|
||||
int skipped_pgs = 0;
|
||||
pool_id_t pool_id = INODE_POOL(inode);
|
||||
if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())
|
||||
{
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
inode_list_t *lst = new inode_list_t();
|
||||
lst->cli = this;
|
||||
lst->pool_id = pool_id;
|
||||
lst->inode = inode;
|
||||
lst->callback = callback;
|
||||
auto pool_cfg = st_cli.pool_config[pool_id];
|
||||
for (auto & pg_item: pool_cfg.pg_config)
|
||||
{
|
||||
auto & pg = pg_item.second;
|
||||
if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
|
||||
{
|
||||
skipped_pgs++;
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
inode_list_pg_t *r = new inode_list_pg_t();
|
||||
r->lst = lst;
|
||||
r->pg_num = pg_item.first;
|
||||
r->cur_primary = pg.cur_primary;
|
||||
if (pg.cur_state != PG_ACTIVE)
|
||||
{
|
||||
// Not clean
|
||||
std::set<osd_num_t> all_peers;
|
||||
for (osd_num_t pg_osd: pg.target_set)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (auto & hist_item: pg.target_history)
|
||||
{
|
||||
for (auto pg_osd: hist_item)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
r->list_osds.push_back((inode_list_osd_t){
|
||||
.pg = r,
|
||||
.osd_num = peer_osd,
|
||||
.sent = false,
|
||||
});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Clean
|
||||
r->list_osds.push_back((inode_list_osd_t){
|
||||
.pg = r,
|
||||
.osd_num = pg.cur_primary,
|
||||
.sent = false,
|
||||
});
|
||||
}
|
||||
lst->pgs.push_back(r);
|
||||
}
|
||||
std::sort(lst->pgs.begin(), lst->pgs.end(), [](inode_list_pg_t *a, inode_list_pg_t *b)
|
||||
{
|
||||
return a->cur_primary < b->cur_primary ? true : false;
|
||||
});
|
||||
for (int i = 0; i < lst->pgs.size(); i++)
|
||||
{
|
||||
lst->pgs[i]->pos = i;
|
||||
}
|
||||
lists.push_back(lst);
|
||||
return lst;
|
||||
}
|
||||
|
||||
int cluster_client_t::list_pg_count(inode_list_t *lst)
|
||||
{
|
||||
return lst->pgs.size();
|
||||
}
|
||||
|
||||
void cluster_client_t::list_inode_next(inode_list_t *lst, int next_pgs)
|
||||
{
|
||||
if (next_pgs >= 0)
|
||||
{
|
||||
lst->want += next_pgs;
|
||||
}
|
||||
continue_listing(lst);
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_listing(inode_list_t *lst)
|
||||
{
|
||||
if (lst->done_pgs >= lst->pgs.size())
|
||||
{
|
||||
// All done
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i] == lst)
|
||||
{
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete lst;
|
||||
return;
|
||||
}
|
||||
if (lst->want <= 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lst->pgs.size(); i++)
|
||||
{
|
||||
if (lst->pgs[i] && lst->pgs[i]->sent < lst->pgs[i]->list_osds.size())
|
||||
{
|
||||
for (int j = 0; j < lst->pgs[i]->list_osds.size(); j++)
|
||||
{
|
||||
send_list(&lst->pgs[i]->list_osds[j]);
|
||||
if (lst->want <= 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
{
|
||||
if (cur_list->sent)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (msgr.osd_peer_fds.find(cur_list->osd_num) == msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
msgr.connect_peer(cur_list->osd_num, st_cli.peer_states[cur_list->osd_num]);
|
||||
return;
|
||||
}
|
||||
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = op_id++,
|
||||
.opcode = OSD_OP_SEC_LIST,
|
||||
},
|
||||
.list_pg = cur_list->pg->pg_num,
|
||||
.pg_count = (pg_num_t)pool_cfg.real_pg_count,
|
||||
.pg_stripe_size = pool_cfg.pg_stripe_size,
|
||||
.min_inode = cur_list->pg->lst->inode,
|
||||
.max_inode = cur_list->pg->lst->inode,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
|
||||
{
|
||||
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||
cur_list->pg->has_unstable = true;
|
||||
fprintf(
|
||||
stderr, "[PG %u/%u] Inode still has %lu unstable object versions out of total %lu - is it still open?\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count,
|
||||
op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
|
||||
oid.stripe = oid.stripe & ~STRIPE_MASK;
|
||||
cur_list->pg->objects.insert(oid);
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
auto lst = cur_list->pg->lst;
|
||||
auto pg = cur_list->pg;
|
||||
pg->done++;
|
||||
if (pg->done >= pg->list_osds.size())
|
||||
{
|
||||
int status = 0;
|
||||
lst->done_pgs++;
|
||||
if (lst->done_pgs >= lst->pgs.size())
|
||||
{
|
||||
status |= INODE_LIST_DONE;
|
||||
}
|
||||
if (pg->has_unstable)
|
||||
{
|
||||
status |= INODE_LIST_HAS_UNSTABLE;
|
||||
}
|
||||
lst->callback(lst, std::move(pg->objects), pg->pg_num, pg->cur_primary, status);
|
||||
lst->pgs[pg->pos] = NULL;
|
||||
delete pg;
|
||||
}
|
||||
continue_listing(lst);
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
cur_list->sent = true;
|
||||
cur_list->pg->sent++;
|
||||
cur_list->pg->lst->want--;
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_lists()
|
||||
{
|
||||
for (auto lst: lists)
|
||||
{
|
||||
continue_listing(lst);
|
||||
}
|
||||
}
|
@@ -765,3 +765,22 @@ void etcd_state_client_t::close_watch(inode_watch_t* watch)
    }
    delete watch;
}

json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cfg)
{
    json11::Json::object new_cfg = json11::Json::object {
        { "name", cfg->name },
        { "size", cfg->size },
    };
    if (cfg->parent_id)
    {
        if (INODE_POOL(cfg->num) != INODE_POOL(cfg->parent_id))
            new_cfg["parent_pool"] = (uint64_t)INODE_POOL(cfg->parent_id);
        new_cfg["parent_id"] = (uint64_t)INODE_NO_POOL(cfg->parent_id);
    }
    if (cfg->readonly)
    {
        new_cfg["readonly"] = true;
    }
    return new_cfg;
}
|
||||
|
@@ -99,6 +99,7 @@ public:
|
||||
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
||||
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
||||
|
||||
json11::Json::object & serialize_inode_cfg(inode_config_t *cfg);
|
||||
etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
|
||||
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
|
||||
|
@@ -117,7 +117,7 @@ osd_messenger_t::~osd_messenger_t()
|
||||
}
|
||||
while (clients.size() > 0)
|
||||
{
|
||||
stop_client(clients.begin()->first, true);
|
||||
stop_client(clients.begin()->first, true, true);
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_context)
|
||||
|
@@ -156,7 +156,7 @@ public:
|
||||
void init();
|
||||
void parse_config(const json11::Json & config);
|
||||
void connect_peer(uint64_t osd_num, json11::Json peer_state);
|
||||
void stop_client(int peer_fd, bool force = false);
|
||||
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
|
||||
void outbox_push(osd_op_t *cur_op);
|
||||
std::function<void(osd_op_t*)> exec_op;
|
||||
std::function<void(osd_num_t)> repeer_pgs;
|
||||
|
@@ -15,7 +15,7 @@ osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
while (clients.size() > 0)
|
||||
{
|
||||
stop_client(clients.begin()->first, true);
|
||||
stop_client(clients.begin()->first, true, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -46,6 +46,9 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||
ctx->used_max_cqe -= max_send+max_recv;
|
||||
if (qp)
|
||||
ibv_destroy_qp(qp);
|
||||
if (recv_buffers.size())
|
||||
for (auto b: recv_buffers)
|
||||
free(b);
|
||||
}
|
||||
|
||||
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu)
|
||||
@@ -55,6 +58,7 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
||||
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
|
||||
ctx->mtu = mtu;
|
||||
|
||||
srand48(time(NULL));
|
||||
dev_list = ibv_get_device_list(NULL);
|
||||
if (!dev_list)
|
||||
{
|
||||
@@ -477,7 +481,11 @@ void osd_messenger_t::handle_rdma_events()
|
||||
if (!is_send)
|
||||
{
|
||||
cl->rdma_conn->cur_recv--;
|
||||
handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len);
|
||||
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
}
|
||||
free(cl->rdma_conn->recv_buffers[0]);
|
||||
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||
try_recv_rdma(cl);
|
||||
|
@@ -41,7 +41,7 @@ void osd_messenger_t::cancel_op(osd_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
auto it = clients.find(peer_fd);
|
||||
@@ -136,7 +136,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
clients.erase(it);
|
||||
}
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0)
|
||||
if (cl->refs <= 0 || force_delete)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
|
26 src/osd.cpp
@@ -7,6 +7,8 @@
|
||||
#include <netinet/tcp.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
#include "osd_primary.h"
|
||||
#include "osd.h"
|
||||
#include "http_client.h"
|
||||
|
||||
@@ -365,6 +367,7 @@ void osd_t::print_stats()
|
||||
|
||||
void osd_t::print_slow()
|
||||
{
|
||||
bool has_slow = false;
|
||||
char alloc[1024];
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
@@ -426,9 +429,32 @@ void osd_t::print_slow()
|
||||
{
|
||||
bufprintf(" inode=%lx offset=%lx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
|
||||
}
|
||||
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE ||
|
||||
op->req.hdr.opcode == OSD_OP_SEC_SYNC || op->req.hdr.opcode == OSD_OP_SEC_LIST ||
|
||||
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
|
||||
op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
{
|
||||
bufprintf(" state=%d", PRIV(op->bs_op)->op_state);
|
||||
int wait_for = PRIV(op->bs_op)->wait_for;
|
||||
if (wait_for)
|
||||
{
|
||||
bufprintf(" wait=%d (detail=%lu)", wait_for, PRIV(op->bs_op)->wait_detail);
|
||||
}
|
||||
}
|
||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
|
||||
{
|
||||
bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
|
||||
}
|
||||
#undef bufprintf
|
||||
printf("%s\n", alloc);
|
||||
has_slow = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_slow)
|
||||
{
|
||||
bs->dump_diagnostics();
|
||||
}
|
||||
}
|
||||
|
@@ -9,6 +9,8 @@
|
||||
#define POOL_ID_MAX 0x10000
|
||||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
typedef uint32_t pool_id_t;
|
||||
|
@@ -198,7 +198,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
{
|
||||
// Fast happy-path
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
||||
submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.cur_set.data(), cur_op);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, pg.cur_set.data(), cur_op);
|
||||
op_data->st = 1;
|
||||
}
|
||||
else
|
||||
@@ -215,7 +215,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
op_data->scheme = pg.scheme;
|
||||
op_data->degraded = 1;
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
|
||||
submit_primary_subops(SUBMIT_READ, op_data->target_ver, cur_set, cur_op);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, cur_set, cur_op);
|
||||
op_data->st = 1;
|
||||
}
|
||||
}
|
||||
@@ -353,6 +353,7 @@ resume_3:
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
}
|
||||
// Save version override for parallel reads
|
||||
|
@@ -100,6 +100,7 @@ resume_3:
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
}
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
@@ -260,6 +261,7 @@ resume_9:
|
||||
}
|
||||
}
|
||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
continue_others:
|
||||
osd_op_t *next_op = NULL;
|
||||
auto next_it = pg.write_queue.find(op_data->oid);
|
||||
|
410 src/rm_inode.cpp
@@ -1,410 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
/**
|
||||
* Inode removal tool
|
||||
* May be included into a bigger "command-line management interface" in the future
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
struct rm_pg_t;
|
||||
|
||||
struct rm_pg_osd_t
|
||||
{
|
||||
rm_pg_t *pg = NULL;
|
||||
osd_num_t osd_num;
|
||||
bool sent = false;
|
||||
};
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::vector<rm_pg_osd_t> list_osds;
|
||||
int state = 0;
|
||||
int to_list;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
class rm_inode_t
|
||||
{
|
||||
protected:
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t iodepth = 0, parallel_osds = 0;
|
||||
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
ring_consumer_t consumer;
|
||||
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool started = false;
|
||||
bool progress = true;
|
||||
bool list_first = false;
|
||||
int log_level = 0;
|
||||
|
||||
public:
|
||||
static json11::Json::object parse_args(int narg, const char *args[])
|
||||
{
|
||||
json11::Json::object cfg;
|
||||
cfg["progress"] = "1";
|
||||
for (int i = 1; i < narg; i++)
|
||||
{
|
||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
||||
{
|
||||
help();
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
|
||||
}
|
||||
}
|
||||
return cfg;
|
||||
}
|
||||
|
||||
    static void help()
    {
        printf(
            "Vitastor inode removal tool\n"
            "(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
            "USAGE:\n"
            "  %s [--etcd_address <etcd_address>] --pool <pool> --inode <inode> [--wait-list]\n",
            exe_name
        );
        exit(0);
    }

    void run(json11::Json cfg)
    {
        inode = cfg["inode"].uint64_value();
        pool_id = cfg["pool"].uint64_value();
        if (pool_id)
            inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)pool_id) << (64-POOL_ID_BITS));
        pool_id = INODE_POOL(inode);
        if (!pool_id)
        {
            fprintf(stderr, "pool is missing");
            exit(1);
        }
        iodepth = cfg["iodepth"].uint64_value();
        if (!iodepth)
            iodepth = 32;
        parallel_osds = cfg["parallel_osds"].uint64_value();
        if (!parallel_osds)
            parallel_osds = 4;
        log_level = cfg["log_level"].int64_value();
        progress = cfg["progress"].uint64_value() ? true : false;
        list_first = cfg["wait-list"].uint64_value() ? true : false;
        // Create client
        ringloop = new ring_loop_t(512);
        epmgr = new epoll_manager_t(ringloop);
        cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
        cli->on_ready([this]() { start_delete(); });
        // Initialize job
        consumer.loop = [this]()
        {
            if (started)
                continue_delete();
            ringloop->submit();
        };
        ringloop->register_consumer(&consumer);
        // Loop until it completes
        while (1)
        {
            ringloop->loop();
            ringloop->wait();
        }
    }

    void start_delete()
    {
        if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end())
        {
            fprintf(stderr, "Pool %u does not exist\n", pool_id);
            exit(1);
        }
        auto pool_cfg = cli->st_cli.pool_config[pool_id];
        for (auto & pg_item: pool_cfg.pg_config)
        {
            auto & pg = pg_item.second;
            if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
            {
                fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
                continue;
            }
            rm_pg_t *r = new rm_pg_t();
            r->pg_num = pg_item.first;
            r->rm_osd_num = pg.cur_primary;
            r->state = RM_LISTING;
            if (pg.cur_state != PG_ACTIVE)
            {
                std::set<osd_num_t> all_peers;
                for (osd_num_t pg_osd: pg.target_set)
                {
                    if (pg_osd != 0)
                    {
                        all_peers.insert(pg_osd);
                    }
                }
                for (osd_num_t pg_osd: pg.all_peers)
                {
                    if (pg_osd != 0)
                    {
                        all_peers.insert(pg_osd);
                    }
                }
                for (auto & hist_item: pg.target_history)
                {
                    for (auto pg_osd: hist_item)
                    {
                        if (pg_osd != 0)
                        {
                            all_peers.insert(pg_osd);
                        }
                    }
                }
                for (osd_num_t peer_osd: all_peers)
                {
                    r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
                }
            }
            else
            {
                r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
            }
            r->to_list = r->list_osds.size();
            lists.push_back(r);
        }
        std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
        {
            return a->rm_osd_num < b->rm_osd_num ? true : false;
        });
        pgs_to_list = lists.size();
        started = true;
        continue_delete();
    }

    void send_list(rm_pg_osd_t *cur_list)
    {
        if (cur_list->sent)
        {
            return;
        }
        if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
            cli->msgr.osd_peer_fds.end())
        {
            // Initiate connection
            cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
            return;
        }
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = cli->msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
                .list_pg = cur_list->pg->pg_num,
                .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
                .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
                .min_inode = inode,
                .max_inode = inode,
            },
        };
        op->callback = [this, cur_list](osd_op_t *op)
        {
            cur_list->pg->to_list--;
            if (op->reply.hdr.retval < 0)
            {
                fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
                    pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
            }
            else
            {
                if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
                {
                    // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
                    printf(
                        "[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
                        pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
                    );
                }
                if (log_level > 0)
                {
                    printf(
                        "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
                        pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
                    );
                }
                for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
                {
                    object_id oid = ((obj_ver_id*)op->buf)[i].oid;
                    oid.stripe = oid.stripe & ~STRIPE_MASK;
                    cur_list->pg->objects.insert(oid);
                }
            }
            delete op;
            if (cur_list->pg->to_list <= 0)
            {
                cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
                cur_list->pg->obj_pos = cur_list->pg->objects.begin();
                cur_list->pg->obj_count = cur_list->pg->objects.size();
                total_count += cur_list->pg->obj_count;
                total_prev_pct = 0;
                cur_list->pg->state = RM_REMOVING;
                pgs_to_list--;
            }
            continue_delete();
        };
        cli->msgr.outbox_push(op);
        cur_list->sent = true;
    }

    void send_ops(rm_pg_t *cur_list)
    {
        if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
            cli->msgr.osd_peer_fds.end())
        {
            // Initiate connection
            cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]);
            return;
        }
        while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end())
        {
            osd_op_t *op = new osd_op_t();
            op->op_type = OSD_OP_OUT;
            op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
            op->req = (osd_any_op_t){
                .rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
                        .id = cli->msgr.next_subop_id++,
                        .opcode = OSD_OP_DELETE,
                    },
                    .inode = cur_list->obj_pos->inode,
                    .offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
                    .len = 0,
                },
            };
            op->callback = [this, cur_list](osd_op_t *op)
            {
                cur_list->in_flight--;
                if (op->reply.hdr.retval < 0)
                {
                    fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
                        cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
                }
                delete op;
                cur_list->obj_done++;
                total_done++;
                continue_delete();
            };
            cli->msgr.outbox_push(op);
            cur_list->obj_pos++;
            cur_list->in_flight++;
        }
        if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
        {
            cur_list->obj_count = 0;
            cur_list->obj_done = cur_list->obj_prev_done = 0;
            cur_list->state = RM_END;
        }
    }

    void continue_delete()
    {
        int par_osd = 0;
        osd_num_t max_seen_osd = 0;
        bool no_del = false;
        if (list_first)
        {
            int i, n = 0;
            for (i = 0; i < lists.size(); i++)
            {
                if (lists[i]->state == RM_LISTING)
                {
                    n++;
                }
            }
            if (n > 0)
            {
                no_del = true;
            }
        }
        for (int i = 0; i < lists.size(); i++)
        {
            if (lists[i]->state == RM_END)
            {
                delete lists[i];
                lists.erase(lists.begin()+i, lists.begin()+i+1);
                i--;
            }
            else if (lists[i]->rm_osd_num > max_seen_osd)
            {
                if (lists[i]->state == RM_LISTING)
                {
                    for (int j = 0; j < lists[i]->list_osds.size(); j++)
                    {
                        send_list(&lists[i]->list_osds[j]);
                    }
                }
                else if (lists[i]->state == RM_REMOVING)
                {
                    if (no_del)
                    {
                        continue;
                    }
                    send_ops(lists[i]);
                }
                par_osd++;
                max_seen_osd = lists[i]->rm_osd_num;
                if (par_osd >= parallel_osds)
                {
                    break;
                }
            }
        }
        if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
        {
            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
            total_prev_pct = total_done*1000/total_count;
        }
        if (!lists.size())
        {
            printf("Done, inode %lu in pool %u removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id);
            exit(0);
        }
    }
};

int main(int narg, const char *args[])
{
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    exe_name = args[0];
    rm_inode_t *p = new rm_inode_t();
    p->run(rm_inode_t::parse_args(narg, args));
    return 0;
}
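One detail worth noting before this tool disappears into vitastor-cli: the inode number given on the command line is packed together with the pool id into a single 64-bit value, as in the `run()` method above, and `INODE_POOL()` then extracts the pool back from the top bits. The standalone sketch below reproduces only that packing; `POOL_ID_BITS = 16` is an assumption for the example, in the real tree the constant and the `INODE_POOL` macro come from the Vitastor headers.

```
// Standalone sketch of the pool/inode packing used by the removed tool.
// POOL_ID_BITS = 16 is an assumption here, not taken from this diff.
#include <cstdint>
#include <cstdio>

static const int POOL_ID_BITS = 16;

static uint64_t pack_inode(uint32_t pool_id, uint64_t inode)
{
    // Keep the low (64-POOL_ID_BITS) bits of the inode, put the pool id on top -
    // the same expression as in run() above.
    return (inode & ((1ul << (64-POOL_ID_BITS)) - 1))
        | ((uint64_t)pool_id << (64-POOL_ID_BITS));
}

static uint32_t inode_pool(uint64_t packed)
{
    // Equivalent of INODE_POOL(): the pool id lives in the top POOL_ID_BITS bits
    return packed >> (64-POOL_ID_BITS);
}

int main()
{
    uint64_t packed = pack_inode(1, 1);   // --pool 1 --inode 1
    printf("packed=0x%016lx pool=%u inode=%lu\n",
        (unsigned long)packed, inode_pool(packed),
        (unsigned long)(packed & ((1ul << (64-POOL_ID_BITS)) - 1)));
    return 0;
}
```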
@@ -72,7 +72,7 @@ static void vitastor_c_write_handler(void *opaque)

vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
    const char *config_path, const char *etcd_host, const char *etcd_prefix,
    bool use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
    int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
{
    json11::Json cfg_json = vitastor_c_common_config(
        config_path, etcd_host, etcd_prefix, use_rdma,

@@ -28,7 +28,7 @@ typedef void QEMUSetFDHandler(void *ctx, int fd, int is_external, IOHandler *fd_

vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
    const char *config_path, const char *etcd_host, const char *etcd_prefix,
    bool use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
    int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
    int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len);
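These two hunks change the `use_rdma` parameter of `vitastor_c_create_qemu` from `bool` to `int` in both the implementation and the C header, matching the `int` already used by `vitastor_c_create_uring`. A hedged usage sketch follows; only the `vitastor_c_create_uring` prototype is taken from the hunk above, while the header name `vitastor_c.h` and the `vitastor_c_destroy()` cleanup call are assumptions.

```
// Sketch of calling the C API with the int-typed use_rdma flag.
// "vitastor_c.h" and vitastor_c_destroy() are assumptions; the
// vitastor_c_create_uring() prototype is copied from the header hunk above.
#include "vitastor_c.h"
#include <cstdio>

int main()
{
    vitastor_c *client = vitastor_c_create_uring(
        NULL,                      // config_path: use defaults
        "127.0.0.1:2379/v3",       // etcd_host
        "/vitastor",               // etcd_prefix
        0,                         // use_rdma (now an int, not a bool)
        NULL, 0, 0, 0,             // rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu
        1);                        // log_level
    if (!client)
    {
        fprintf(stderr, "failed to create vitastor client\n");
        return 1;
    }
    vitastor_c_destroy(client);
    return 0;
}
```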
@@ -3,6 +3,7 @@
. `dirname $0`/common.sh

OSD_SIZE=${OSD_SIZE:-1024}
PG_COUNT=${PG_COUNT:-1}

dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))

@@ -25,16 +26,16 @@ if [ -n "$GLOBAL_CONF" ]; then
    $ETCDCTL put /vitastor/config/global "$GLOBAL_CONF"
fi

$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'

sleep 2

if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then
    format_error "FAILED: 1 PG NOT CONFIGURED"
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select((.osd_set | sort) == ["1","2","3"]) ] | length) == '$PG_COUNT); then
    format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi

if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
    format_error "FAILED: 1 PG NOT UP"
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then
    format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi

if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
tests/test_rm.sh (new executable file)
@@ -0,0 +1,14 @@
#!/bin/bash -ex

PG_COUNT=16
. `dirname $0`/run_3osds.sh

LD_PRELOAD=libasan.so.5 \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 \
        -end_fsync=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10

$ETCDCTL get --prefix '/vitastor/pg/state'

build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 1

format_green OK
@@ -6,18 +6,19 @@

$ETCDCTL put /vitastor/config/inode/1/2 '{"name":"testimg","size":'$((32*1024*1024))'}'

LD_PRELOAD=libasan.so.5 \
LD_PRELOAD="libasan.so.5 build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
        -etcd=$ETCD_URL -pool=1 -inode=2 -size=32M -cluster_log_level=10

$ETCDCTL put /vitastor/config/inode/1/2 '{"name":"testimg@0","size":'$((32*1024*1024))'}'
$ETCDCTL put /vitastor/config/inode/1/3 '{"parent_id":2,"name":"testimg","size":'$((32*1024*1024))'}'

LD_PRELOAD=libasan.so.5 \
# Preload build/src/libfio_vitastor.so so libasan detects all symbols
LD_PRELOAD="libasan.so.5 build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface \
        -rw=randwrite -etcd=$ETCD_URL -image=testimg -number_ios=1024

LD_PRELOAD=libasan.so.5 \
LD_PRELOAD="libasan.so.5 build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M

qemu-img convert -S 4096 -p \

@@ -38,4 +39,16 @@ node mon/merge.js ./testdata/layer0.bin ./testdata/layer1.bin ./testdata/check.b

cmp ./testdata/merged.bin ./testdata/check.bin

# Test merge

$ETCDCTL put /vitastor/config/inode/1/3 '{"parent_id":2,"name":"testimg","size":'$((32*1024*1024))'}'

build/src/vitastor-cli rm --etcd_address $ETCD_URL testimg@0

qemu-img convert -S 4096 -p \
    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
    -O raw ./testdata/merged-by-tool.bin

cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin

format_green OK