Compare commits
140 Commits
Author | SHA1 | Date | |
---|---|---|---|
14a5bcf493 | |||
6aaded0c8f | |||
3cd869df75 | |||
ae725de14e | |||
8dd5534527 | |||
3501ef87e4 | |||
99d4efbdc0 | |||
f623f7a9a1 | |||
7be9ed93d5 | |||
b019988e2b | |||
0db5400cb7 | |||
5a13db107b | |||
1288cfb0af | |||
2eba37db8a | |||
f9e0b0db27 | |||
f6acd5e79c | |||
6fbeb5c668 | |||
8e55869b71 | |||
7bb9004435 | |||
d9d484e8bb | |||
0c0ab64155 | |||
84fca8abca | |||
eaf0fe66a1 | |||
5ca25c0e7d | |||
1b7f2eac8e | |||
ddd16e8613 | |||
046a9f7a67 | |||
bd55b24827 | |||
09d69f7968 | |||
cc4d170ef0 | |||
335b73a3d5 | |||
fc6c5a853e | |||
445393dfc4 | |||
021762193b | |||
a6dee28f4e | |||
6e0ad777e3 | |||
13d069cf5f | |||
504246b1db | |||
fd647021eb | |||
049a7b260e | |||
17d0da1a74 | |||
fe277127bb | |||
79f3147d0c | |||
5126e67c3f | |||
90b1bdee43 | |||
5d501d0d43 | |||
157a62628a | |||
d4da42bb05 | |||
2883df733a | |||
77e3870f8f | |||
b82be8136a | |||
6c4f407575 | |||
c0489d237b | |||
7c250f165c | |||
b46dbbbefb | |||
1810fbe622 | |||
4d6c9bc294 | |||
e7df9683f1 | |||
a5dd943fcc | |||
c9b527f2e2 | |||
1551a49454 | |||
d46feccd03 | |||
959e2e2df9 | |||
b7bc3d652d | |||
fc2762d60a | |||
85b3c691e9 | |||
0ac4645a9e | |||
87922bc660 | |||
77b97b0613 | |||
0713120315 | |||
bce082a444 | |||
383305da88 | |||
639c809827 | |||
71d78c1409 | |||
3e2e2f9846 | |||
cb48c70083 | |||
989571bb74 | |||
0e1d069ad7 | |||
a36b4e5933 | |||
c809d86846 | |||
0bd5eb1f20 | |||
2c8ddc5431 | |||
e1f3829bb1 | |||
a980d65f78 | |||
bcc93e548e | |||
68e9f71723 | |||
1ae4b9a799 | |||
79d4b57f0e | |||
cc73e42488 | |||
6cb0fdb571 | |||
8e1ea15f58 | |||
d7f1b3a2dd | |||
0049a6ed4a | |||
05ed9a27a4 | |||
300a149513 | |||
0223016ce6 | |||
9368cc7d9b | |||
74743ccd3f | |||
ec1c7e6be4 | |||
bc643b24cf | |||
2a66cc3f11 | |||
bd74ce4b70 | |||
7c07303d12 | |||
ac6bacc46e | |||
f7fbfb8174 | |||
ebf85a7515 | |||
36413d89c3 | |||
fb9505d5db | |||
77e1badfad | |||
b83359fdbc | |||
2f4f46d7eb | |||
65a4aecb8c | |||
d08a2fb8ee | |||
5739d52600 | |||
d44dba43b3 | |||
8984556689 | |||
457b47d311 | |||
af62f6374b | |||
1c7f3710be | |||
a67a09da7c | |||
4ed29c19c4 | |||
dd3f64cf62 | |||
e1f4fcb76a | |||
2d24b4d70d | |||
d9e2705db7 | |||
5935762730 | |||
a945a46d56 | |||
82ac6416d3 | |||
df4661230e | |||
72a29b7031 | |||
2d42f29385 | |||
17240c6144 | |||
9e627a4414 | |||
90b1019636 | |||
df604afbd5 | |||
47c7aa62de | |||
9f2dc48d0f | |||
6d951b21fb | |||
552f28cb3e | |||
e87b6e26f7 |
@@ -810,6 +810,24 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_reweight_half:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_reweight_half.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_heal_csum_32k_dmj:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
@@ -4,4 +4,17 @@ project(vitastor)
|
||||
|
||||
set(VITASTOR_VERSION "2.3.0")
|
||||
|
||||
include(CTest)
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
# make -j16 -C ../../build test_heap && ../../build/src/test/test_heap
|
||||
# make -j16 -C ../../build test_heap && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R heap --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
|
||||
# make -j16 -C ../../build test_blockstore && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R blockstore --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
|
||||
# kcov --include-path=../../../src ../../kcov ./test_blockstore
|
||||
add_dependencies(test build_tests)
|
||||
add_subdirectory(src)
|
||||
|
@@ -36,7 +36,7 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
|
||||
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||
apt-get update && \
|
||||
apt-get install -y vitastor-client && \
|
||||
apt-get install -y vitastor-client ibverbs-providers && \
|
||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||
dpkg -x qemu-utils*.deb tmp1 && \
|
||||
|
2
debian/control
vendored
2
debian/control
vendored
@@ -4,7 +4,7 @@ Priority: optional
|
||||
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Build-Depends: debhelper, g++ (>= 8), libstdc++6 (>= 8),
|
||||
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
|
||||
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
|
||||
libibverbs-dev, librdmacm-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
|
||||
node-bindings <!nocheck>, node-gyp, node-nan
|
||||
Standards-Version: 4.5.0
|
||||
Homepage: https://vitastor.io/
|
||||
|
@@ -3,7 +3,7 @@
|
||||
FROM debian:bookworm
|
||||
|
||||
ADD etc/apt /etc/apt/
|
||||
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
||||
RUN apt-get update && apt-get -y install vitastor ibverbs-providers udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
||||
ADD sleep.sh /usr/bin/
|
||||
ADD install.sh /usr/bin/
|
||||
ADD scripts /opt/scripts/
|
||||
|
@@ -491,7 +491,7 @@ Can be used to slow down scrubbing if it affects user load too much.
|
||||
## scrub_list_limit
|
||||
|
||||
- Type: integer
|
||||
- Default: 1000
|
||||
- Default: 262144
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
@@ -514,7 +514,7 @@ fsync небезопасным даже с режимом "directsync".
|
||||
## scrub_list_limit
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1000
|
||||
- Значение по умолчанию: 262144
|
||||
- Можно менять на лету: да
|
||||
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
|
@@ -566,7 +566,7 @@
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
- name: scrub_list_limit
|
||||
type: int
|
||||
default: 1000
|
||||
default: 262144
|
||||
online: true
|
||||
info: |
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
@@ -73,6 +73,8 @@ Options (automatic mode):
|
||||
--max_other 10%
|
||||
Use disks for OSD data even if they already have non-Vitastor partitions,
|
||||
but only if these take up no more than this percent of disk space.
|
||||
--dry-run
|
||||
Check and print new OSD count for each disk but do not actually create them.
|
||||
```
|
||||
|
||||
Options (single-device mode):
|
||||
|
@@ -74,6 +74,8 @@ vitastor-disk - инструмент командной строки для уп
|
||||
--max_other 10%
|
||||
Использовать диски под данные OSD, даже если на них уже есть не-Vitastor-овые
|
||||
разделы, но только в случае, если они занимают не более данного процента диска.
|
||||
--dry-run
|
||||
Проверить и вывести число новых OSD для каждого диска, но не создавать их.
|
||||
```
|
||||
|
||||
Опции для режима одного OSD:
|
||||
|
@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
|
||||
const stat = state.osd.stats[osd_num];
|
||||
const osd_cfg = state.config.osd[osd_num];
|
||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||
if (isNaN(reweight) || reweight < 0 || reweight > 0)
|
||||
if (isNaN(reweight) || reweight < 0 || reweight > 1)
|
||||
reweight = 1;
|
||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||
osd_cfg && osd_cfg.noout))
|
||||
|
@@ -499,4 +499,55 @@ sub rename_volume
|
||||
return "${storeid}:${base_name}${target_volname}";
|
||||
}
|
||||
|
||||
sub _monkey_patch_qemu_blockdev_options
|
||||
{
|
||||
my ($cfg, $volid, $machine_version, $options) = @_;
|
||||
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
|
||||
|
||||
my $scfg = PVE::Storage::storage_config($cfg, $storeid);
|
||||
|
||||
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
|
||||
|
||||
my ($vtype) = $plugin->parse_volname($volname);
|
||||
die "cannot use volume of type '$vtype' as a QEMU blockdevice\n"
|
||||
if $vtype ne 'images' && $vtype ne 'iso' && $vtype ne 'import';
|
||||
|
||||
return $plugin->qemu_blockdev_options($scfg, $storeid, $volname, $machine_version, $options);
|
||||
}
|
||||
|
||||
sub qemu_blockdev_options
|
||||
{
|
||||
my ($class, $scfg, $storeid, $volname, $machine_version, $options) = @_;
|
||||
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
|
||||
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
|
||||
$name .= '@'.$options->{'snapshot-name'} if $options->{'snapshot-name'};
|
||||
if ($scfg->{vitastor_nbd})
|
||||
{
|
||||
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
|
||||
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
|
||||
die "Image not mapped via NBD" if !$kerneldev;
|
||||
return { driver => 'host_device', filename => $kerneldev };
|
||||
}
|
||||
my $blockdev = {
|
||||
driver => 'vitastor',
|
||||
image => $prefix.$name,
|
||||
};
|
||||
if ($scfg->{vitastor_config_path})
|
||||
{
|
||||
$blockdev->{'config-path'} = $scfg->{vitastor_config_path};
|
||||
}
|
||||
if ($scfg->{vitastor_etcd_address})
|
||||
{
|
||||
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
|
||||
$blockdev->{'etcd-host'} = $scfg->{vitastor_etcd_address};
|
||||
}
|
||||
if ($scfg->{vitastor_etcd_prefix})
|
||||
{
|
||||
$blockdev->{'etcd-prefix'} = $scfg->{vitastor_etcd_prefix};
|
||||
}
|
||||
return $blockdev;
|
||||
}
|
||||
|
||||
*PVE::Storage::qemu_blockdev_options = *_monkey_patch_qemu_blockdev_options;
|
||||
|
||||
1;
|
||||
|
@@ -19,6 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
endif()
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
set(ENABLE_COVERAGE false CACHE BOOL "Enable code coverage")
|
||||
|
||||
add_definitions(-DVITASTOR_VERSION="2.3.0")
|
||||
add_definitions(-D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -fvisibility=hidden -I ${CMAKE_SOURCE_DIR}/src)
|
||||
@@ -31,6 +32,11 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility-inlines-hid
|
||||
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -fvisibility-inlines-hidden")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvisibility-inlines-hidden")
|
||||
|
||||
if (${ENABLE_COVERAGE})
|
||||
add_definitions(-coverage)
|
||||
add_link_options(-coverage)
|
||||
endif()
|
||||
|
||||
set(CMAKE_BUILD_TYPE RelWithDebInfo)
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
|
||||
@@ -78,14 +84,6 @@ else()
|
||||
set(LIBURING_LIBRARIES uring)
|
||||
endif (${WITH_SYSTEM_LIBURING})
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
add_dependencies(test build_tests)
|
||||
|
||||
include_directories(
|
||||
../
|
||||
${CMAKE_SOURCE_DIR}/src/blockstore
|
||||
|
@@ -2,15 +2,17 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
# libvitastor_blk.so
|
||||
add_library(vitastor_blk SHARED
|
||||
../util/allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
|
||||
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp ../util/crc32c.c ../util/ringloop.cpp
|
||||
# libvitastor_blk.a
|
||||
add_library(vitastor_blk STATIC
|
||||
../util/allocator.cpp ../util/crc32c.c ../util/ringloop.cpp
|
||||
multilist.cpp blockstore_heap.cpp blockstore_disk.cpp
|
||||
blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp
|
||||
blockstore_flush.cpp blockstore_read.cpp blockstore_stable.cpp blockstore_sync.cpp blockstore_write.cpp
|
||||
)
|
||||
target_compile_options(vitastor_blk PUBLIC -fPIC)
|
||||
target_link_libraries(vitastor_blk
|
||||
${LIBURING_LIBRARIES}
|
||||
${ISAL_LIBRARIES}
|
||||
tcmalloc_minimal
|
||||
# for timerfd_manager
|
||||
vitastor_common
|
||||
)
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
|
||||
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd)
|
||||
{
|
||||
impl = new blockstore_impl_t(config, ringloop, tfd);
|
||||
}
|
||||
@@ -48,9 +48,9 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
|
||||
return impl->read_bitmap(oid, target_version, bitmap, result_version);
|
||||
}
|
||||
|
||||
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
||||
const std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
||||
{
|
||||
return impl->inode_space_stats;
|
||||
return impl->get_inode_space_stats();
|
||||
}
|
||||
|
||||
void blockstore_t::dump_diagnostics()
|
||||
@@ -82,8 +82,3 @@ uint32_t blockstore_t::get_bitmap_granularity()
|
||||
{
|
||||
return impl->get_bitmap_granularity();
|
||||
}
|
||||
|
||||
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
{
|
||||
impl->set_no_inode_stats(pool_ids);
|
||||
}
|
||||
|
@@ -22,17 +22,20 @@
|
||||
#define DIRECT_IO_ALIGNMENT 512
|
||||
#endif
|
||||
|
||||
// Memory allocation alignment (page size is usually optimal)
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Default block size is 128 KB, current allowed range is 4K - 128M
|
||||
#define DEFAULT_DATA_BLOCK_ORDER 17
|
||||
#define MIN_DATA_BLOCK_SIZE 4*1024
|
||||
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
|
||||
#define MIN_JOURNAL_SIZE 1024*1024
|
||||
|
||||
// "VITAstor"
|
||||
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
|
||||
#define BLOCKSTORE_META_FORMAT_V1 1
|
||||
#define BLOCKSTORE_META_FORMAT_V2 2
|
||||
#define BLOCKSTORE_META_FORMAT_HEAP 3
|
||||
|
||||
#define BS_OP_MIN 1
|
||||
#define BS_OP_READ 1
|
||||
#define BS_OP_WRITE 2
|
||||
@@ -48,6 +51,12 @@
|
||||
|
||||
/*
|
||||
|
||||
All operations may be submitted in any order, because reads only see completed writes,
|
||||
syncs only sync completed writes and writes don't depend on each other.
|
||||
|
||||
The only restriction is that the external code MUST NOT submit multiple writes for one
|
||||
object in parallel. This is a natural restriction because `version` numbers are used though.
|
||||
|
||||
Blockstore opcode documentation:
|
||||
|
||||
## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
|
||||
@@ -162,8 +171,8 @@ struct __attribute__ ((visibility("default"))) blockstore_op_t
|
||||
uint32_t list_stable_limit;
|
||||
};
|
||||
};
|
||||
void *buf = NULL;
|
||||
void *bitmap = NULL;
|
||||
uint8_t *buf = NULL;
|
||||
uint8_t *bitmap = NULL;
|
||||
int retval = 0;
|
||||
|
||||
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
|
||||
@@ -177,7 +186,7 @@ class __attribute__((visibility("default"))) blockstore_t
|
||||
{
|
||||
blockstore_impl_t *impl;
|
||||
public:
|
||||
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd);
|
||||
~blockstore_t();
|
||||
|
||||
// Update configuration
|
||||
@@ -205,10 +214,7 @@ public:
|
||||
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
||||
|
||||
// Get per-inode space usage statistics
|
||||
std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
@@ -2,11 +2,14 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
#include "blockstore.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_heap.h"
|
||||
#include "str_util.h"
|
||||
#include "allocator.h"
|
||||
|
||||
@@ -44,8 +47,11 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
disk_alignment = parse_size(config["disk_alignment"]);
|
||||
journal_block_size = parse_size(config["journal_block_size"]);
|
||||
meta_block_size = parse_size(config["meta_block_size"]);
|
||||
meta_block_target_free_space = parse_size(config["meta_block_target_free_space"]);
|
||||
bitmap_granularity = parse_size(config["bitmap_granularity"]);
|
||||
meta_format = stoull_full(config["meta_format"]);
|
||||
atomic_write_size = (config.find("atomic_write_size") != config.end()
|
||||
? parse_size(config["atomic_write_size"]) : 4096);
|
||||
if (config.find("data_io") == config.end() &&
|
||||
config.find("meta_io") == config.end() &&
|
||||
config.find("journal_io") == config.end())
|
||||
@@ -90,12 +96,28 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
if (!min_discard_size)
|
||||
min_discard_size = 1024*1024;
|
||||
discard_granularity = parse_size(config["discard_granularity"]);
|
||||
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
|
||||
config["inmemory_metadata"] != "no";
|
||||
inmemory_journal = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
disable_data_fsync = config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes";
|
||||
disable_meta_fsync = config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes";
|
||||
disable_journal_fsync = config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes";
|
||||
if (mock_mode)
|
||||
{
|
||||
data_device_size = parse_size(config["data_device_size"]);
|
||||
data_device_sect = parse_size(config["data_device_sect"]);
|
||||
meta_device_size = parse_size(config["meta_device_size"]);
|
||||
meta_device_sect = parse_size(config["meta_device_sect"]);
|
||||
journal_device_size = parse_size(config["journal_device_size"]);
|
||||
journal_device_sect = parse_size(config["journal_device_sect"]);
|
||||
}
|
||||
// Validate
|
||||
if (!data_block_size)
|
||||
{
|
||||
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
|
||||
}
|
||||
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
|
||||
if (is_power_of_two(data_block_size) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
|
||||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
@@ -131,6 +153,14 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||
}
|
||||
if (!meta_block_target_free_space)
|
||||
{
|
||||
meta_block_target_free_space = 800;
|
||||
}
|
||||
if (meta_block_target_free_space >= meta_block_size)
|
||||
{
|
||||
throw std::runtime_error("meta_block_target_free_space must not exceed "+std::to_string(meta_block_size));
|
||||
}
|
||||
if (data_offset % disk_alignment)
|
||||
{
|
||||
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
|
||||
@@ -179,17 +209,29 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
|
||||
}
|
||||
if (!meta_format)
|
||||
{
|
||||
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
|
||||
}
|
||||
if (meta_device == data_device)
|
||||
{
|
||||
disable_meta_fsync = disable_data_fsync;
|
||||
}
|
||||
if (journal_device == meta_device)
|
||||
{
|
||||
disable_journal_fsync = disable_meta_fsync;
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
void blockstore_disk_t::calc_lengths()
|
||||
{
|
||||
// data
|
||||
data_len = data_device_size - data_offset;
|
||||
if (data_fd == meta_fd && data_offset < meta_offset)
|
||||
if (data_device == meta_device && data_offset < meta_offset)
|
||||
{
|
||||
data_len = meta_offset - data_offset;
|
||||
}
|
||||
if (data_fd == journal_fd && data_offset < journal_offset)
|
||||
if (data_device == journal_device && data_offset < journal_offset)
|
||||
{
|
||||
data_len = data_len < journal_offset-data_offset
|
||||
? data_len : journal_offset-data_offset;
|
||||
@@ -204,23 +246,23 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
data_len = cfg_data_size;
|
||||
}
|
||||
// meta
|
||||
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
|
||||
if (meta_fd == data_fd && meta_offset <= data_offset)
|
||||
meta_area_size = (meta_device == data_device ? data_device_size : meta_device_size) - meta_offset;
|
||||
if (meta_device == data_device && meta_offset <= data_offset)
|
||||
{
|
||||
meta_area_size = data_offset - meta_offset;
|
||||
}
|
||||
if (meta_fd == journal_fd && meta_offset <= journal_offset)
|
||||
if (meta_device == journal_device && meta_offset <= journal_offset)
|
||||
{
|
||||
meta_area_size = meta_area_size < journal_offset-meta_offset
|
||||
? meta_area_size : journal_offset-meta_offset;
|
||||
}
|
||||
// journal
|
||||
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
|
||||
if (journal_fd == data_fd && journal_offset <= data_offset)
|
||||
journal_len = (journal_device == data_device ? data_device_size : (journal_device == meta_device ? meta_device_size : journal_device_size)) - journal_offset;
|
||||
if (journal_device == data_device && journal_offset <= data_offset)
|
||||
{
|
||||
journal_len = data_offset - journal_offset;
|
||||
}
|
||||
if (journal_fd == meta_fd && journal_offset <= meta_offset)
|
||||
if (journal_device == meta_device && journal_offset <= meta_offset)
|
||||
{
|
||||
journal_len = journal_len < meta_offset-journal_offset
|
||||
? journal_len : meta_offset-journal_offset;
|
||||
@@ -230,37 +272,37 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
|
||||
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
|
||||
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
|
||||
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
|
||||
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
|
||||
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
|
||||
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
|
||||
{
|
||||
// Old metadata fits.
|
||||
if (new_doesnt_fit)
|
||||
{
|
||||
printf("Warning: Using old metadata format without checksums because the new format"
|
||||
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
|
||||
}
|
||||
clean_entry_size = clean_entry_v0_size;
|
||||
meta_len = meta_v0_len;
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V1;
|
||||
}
|
||||
else
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||
uint32_t entries_per_block = ((meta_block_size-meta_block_target_free_space) /
|
||||
(sizeof(heap_object_t) + sizeof(heap_write_t) + clean_dyn_size));
|
||||
min_meta_len = (block_count+entries_per_block-1) / entries_per_block * meta_block_size;
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
|
||||
{
|
||||
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
|
||||
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size)
|
||||
/ (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + clean_dyn_size + 4 /*entry_csum*/;
|
||||
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
}
|
||||
else
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||
if (!skip_meta_check && meta_area_size < meta_len)
|
||||
{
|
||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
|
||||
throw std::runtime_error("meta_format = "+std::to_string(meta_format)+" is not supported");
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_disk_t::check_lengths()
|
||||
{
|
||||
if (meta_area_size < min_meta_len)
|
||||
{
|
||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(min_meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
|
||||
}
|
||||
// requested journal size
|
||||
if (!skip_meta_check && cfg_journal_size > journal_len)
|
||||
if (cfg_journal_size > journal_len)
|
||||
{
|
||||
throw std::runtime_error("Requested journal_size is too large");
|
||||
}
|
||||
@@ -321,12 +363,19 @@ static int bs_openmode(const std::string & mode)
|
||||
|
||||
void blockstore_disk_t::open_data()
|
||||
{
|
||||
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
|
||||
if (data_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("data device is already opened");
|
||||
}
|
||||
data_fd = mock_mode ? MOCK_DATA_FD : open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
|
||||
if (data_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
|
||||
}
|
||||
if (disk_alignment % data_device_sect)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
@@ -338,7 +387,7 @@ void blockstore_disk_t::open_data()
|
||||
{
|
||||
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
|
||||
}
|
||||
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode && !disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
|
||||
}
|
||||
@@ -346,19 +395,26 @@ void blockstore_disk_t::open_data()
|
||||
|
||||
void blockstore_disk_t::open_meta()
|
||||
{
|
||||
if (meta_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("metadata device is already opened");
|
||||
}
|
||||
if (meta_device != data_device || meta_io != data_io)
|
||||
{
|
||||
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
|
||||
meta_fd = mock_mode ? MOCK_META_FD : open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
|
||||
if (meta_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
|
||||
}
|
||||
if (meta_offset >= meta_device_size)
|
||||
{
|
||||
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
|
||||
}
|
||||
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode && !disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
|
||||
}
|
||||
@@ -384,15 +440,26 @@ void blockstore_disk_t::open_meta()
|
||||
|
||||
void blockstore_disk_t::open_journal()
|
||||
{
|
||||
if (journal_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("journal device is already opened");
|
||||
}
|
||||
if (journal_device != meta_device || journal_io != meta_io)
|
||||
{
|
||||
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
|
||||
journal_fd = mock_mode ? MOCK_JOURNAL_FD : open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
|
||||
if (journal_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
|
||||
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
|
||||
}
|
||||
if (journal_offset >= journal_device_size)
|
||||
{
|
||||
throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal_device_size));
|
||||
}
|
||||
if (!mock_mode && !disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
|
||||
}
|
||||
@@ -418,25 +485,32 @@ void blockstore_disk_t::open_journal()
|
||||
|
||||
void blockstore_disk_t::close_all()
|
||||
{
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (meta_fd >= 0 && meta_fd != data_fd)
|
||||
close(meta_fd);
|
||||
if (journal_fd >= 0 && journal_fd != meta_fd)
|
||||
close(journal_fd);
|
||||
if (!mock_mode)
|
||||
{
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (meta_fd >= 0 && meta_fd != data_fd)
|
||||
close(meta_fd);
|
||||
if (journal_fd >= 0 && journal_fd != meta_fd)
|
||||
close(journal_fd);
|
||||
}
|
||||
data_fd = meta_fd = journal_fd = -1;
|
||||
}
|
||||
|
||||
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
|
||||
// so it's not a big deal that we can only run it synchronously.
|
||||
int blockstore_disk_t::trim_data(allocator_t *alloc)
|
||||
int blockstore_disk_t::trim_data(std::function<bool(uint64_t)> is_free)
|
||||
{
|
||||
if (mock_mode)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
int r = 0;
|
||||
uint64_t j = 0, i = 0;
|
||||
uint64_t discarded = 0;
|
||||
for (; i <= block_count; i++)
|
||||
{
|
||||
if (i >= block_count || alloc->get(i))
|
||||
if (i >= block_count || is_free(i))
|
||||
{
|
||||
if (i > j && (i-j)*data_block_size >= min_discard_size)
|
||||
{
|
||||
|
@@ -12,6 +12,10 @@
|
||||
// Lower byte of checksum type is its length
|
||||
#define BLOCKSTORE_CSUM_CRC32C 0x104
|
||||
|
||||
#define MOCK_DATA_FD 1000
|
||||
#define MOCK_META_FD 1001
|
||||
#define MOCK_JOURNAL_FD 1002
|
||||
|
||||
class allocator_t;
|
||||
|
||||
struct blockstore_disk_t
|
||||
@@ -22,11 +26,15 @@ struct blockstore_disk_t
|
||||
// Required write alignment and journal/metadata/data areas' location alignment
|
||||
uint32_t disk_alignment = 4096;
|
||||
// Journal block size - minimum_io_size of the journal device is the best choice
|
||||
uint64_t journal_block_size = 4096;
|
||||
uint32_t journal_block_size = 4096;
|
||||
// Metadata block size - minimum_io_size of the metadata device is the best choice
|
||||
uint64_t meta_block_size = 4096;
|
||||
uint32_t meta_block_size = 4096;
|
||||
// Atomic write size of the data block device
|
||||
uint32_t atomic_write_size = 4096;
|
||||
// Target free space in metadata blocks
|
||||
uint32_t meta_block_target_free_space = 800;
|
||||
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
|
||||
uint64_t bitmap_granularity = 4096;
|
||||
uint32_t bitmap_granularity = 4096;
|
||||
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
|
||||
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
|
||||
// Checksum block size, must be a multiple of bitmap_granularity
|
||||
@@ -36,27 +44,36 @@ struct blockstore_disk_t
|
||||
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
|
||||
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
|
||||
std::string data_io, meta_io, journal_io;
|
||||
// It is safe to disable fsync() if drive write cache is writethrough
|
||||
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
||||
// Keep journal (buffered data) in memory?
|
||||
bool inmemory_meta = true;
|
||||
// Keep metadata in memory?
|
||||
bool inmemory_journal = true;
|
||||
// Data discard granularity and minimum size (for the sake of performance)
|
||||
bool discard_on_start = false;
|
||||
uint64_t min_discard_size = 1024*1024;
|
||||
uint64_t discard_granularity = 0;
|
||||
|
||||
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_area_size, min_meta_len, meta_format = 0;
|
||||
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
||||
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
||||
|
||||
uint32_t block_order = 0;
|
||||
uint64_t block_count = 0;
|
||||
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
|
||||
uint32_t clean_entry_bitmap_size = 0;
|
||||
uint32_t clean_entry_size = 0, clean_dyn_size = 0; // for meta_v1/2
|
||||
|
||||
bool mock_mode = false;
|
||||
|
||||
void parse_config(std::map<std::string, std::string> & config);
|
||||
void open_data();
|
||||
void open_meta();
|
||||
void open_journal();
|
||||
void calc_lengths(bool skip_meta_check = false);
|
||||
void calc_lengths();
|
||||
void check_lengths();
|
||||
void close_all();
|
||||
int trim_data(allocator_t *alloc);
|
||||
int trim_data(std::function<bool(uint64_t)> is_free);
|
||||
|
||||
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,22 +1,20 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#define COPY_BUF_JOURNAL 1
|
||||
#define COPY_BUF_DATA 2
|
||||
#define COPY_BUF_ZERO 4
|
||||
#define COPY_BUF_CSUM_FILL 8
|
||||
#define COPY_BUF_COALESCED 16
|
||||
#define COPY_BUF_META_BLOCK 32
|
||||
#define COPY_BUF_JOURNALED_BIG 64
|
||||
#define COPY_BUF_JOURNAL 0x01
|
||||
#define COPY_BUF_DATA 0x02
|
||||
#define COPY_BUF_ZERO 0x04
|
||||
#define COPY_BUF_CSUM_FILL 0x08
|
||||
#define COPY_BUF_COALESCED 0x10
|
||||
#define COPY_BUF_PADDED 0x20
|
||||
#define COPY_BUF_SKIP_CSUM 0x40
|
||||
|
||||
struct copy_buffer_t
|
||||
{
|
||||
int copy_flags;
|
||||
uint64_t offset, len, disk_offset;
|
||||
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
|
||||
void *buf;
|
||||
uint8_t *csum_buf;
|
||||
int *dyn_data;
|
||||
uint32_t copy_flags;
|
||||
uint64_t offset, len, disk_loc, disk_offset, disk_len;
|
||||
uint8_t *buf;
|
||||
uint64_t wr_lsn;
|
||||
};
|
||||
|
||||
struct meta_sector_t
|
||||
@@ -27,13 +25,6 @@ struct meta_sector_t
|
||||
int usage_count;
|
||||
};
|
||||
|
||||
struct flusher_sync_t
|
||||
{
|
||||
bool fsync_meta;
|
||||
int ready_count;
|
||||
int state;
|
||||
};
|
||||
|
||||
struct flusher_meta_write_t
|
||||
{
|
||||
uint64_t sector, pos;
|
||||
@@ -49,94 +40,75 @@ class journal_flusher_co
|
||||
{
|
||||
blockstore_impl_t *bs;
|
||||
journal_flusher_t *flusher;
|
||||
int wait_state, wait_count, wait_journal_count;
|
||||
int co_id;
|
||||
int wait_state, wait_count;
|
||||
struct io_uring_sqe *sqe;
|
||||
struct ring_data_t *data;
|
||||
|
||||
std::list<flusher_sync_t>::iterator cur_sync;
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||
|
||||
obj_ver_id cur;
|
||||
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
||||
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
|
||||
object_id cur_oid;
|
||||
uint64_t copy_id;
|
||||
uint64_t compact_lsn;
|
||||
uint64_t cur_version;
|
||||
heap_object_t *cur_obj;
|
||||
heap_write_t *begin_wr, *end_wr;
|
||||
uint32_t modified_block;
|
||||
bool should_repeat;
|
||||
|
||||
bool try_trim = false;
|
||||
bool skip_copy, has_delete, has_writes;
|
||||
std::vector<copy_buffer_t> v;
|
||||
std::vector<copy_buffer_t>::iterator it;
|
||||
int i;
|
||||
bool fill_incomplete, cleared_incomplete;
|
||||
int read_to_fill_incomplete;
|
||||
std::vector<copy_buffer_t> read_vec;
|
||||
uint32_t overwrite_start, overwrite_end;
|
||||
uint32_t big_start, big_end;
|
||||
int i, res;
|
||||
bool read_to_fill_incomplete;
|
||||
int copy_count;
|
||||
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
|
||||
uint64_t clean_loc;
|
||||
flusher_meta_write_t meta_old, meta_new;
|
||||
bool clean_init_bitmap;
|
||||
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
||||
uint8_t *clean_init_dyn_ptr;
|
||||
uint8_t *new_clean_bitmap;
|
||||
|
||||
uint64_t new_trim_pos;
|
||||
bool do_repeat = false;
|
||||
|
||||
friend class journal_flusher_t;
|
||||
void scan_dirty();
|
||||
bool read_dirty(int wait_base);
|
||||
bool modify_meta_do_reads(int wait_base);
|
||||
bool wait_meta_reads(int wait_base);
|
||||
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
||||
bool clear_incomplete_csum_block_bits(int wait_base);
|
||||
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
|
||||
void update_metadata_entry();
|
||||
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
|
||||
void update_clean_db();
|
||||
void free_data_blocks();
|
||||
bool fsync_batch(bool fsync_meta, int wait_base);
|
||||
bool trim_journal(int wait_base);
|
||||
|
||||
void iterate_checksum_holes(std::function<void(int & pos, uint32_t hole_start, uint32_t hole_end)> cb);
|
||||
void fill_partial_checksum_blocks();
|
||||
void free_buffers();
|
||||
int check_and_punch_checksums();
|
||||
bool calc_block_checksums();
|
||||
bool write_meta_block(int wait_base);
|
||||
bool read_buffered(int wait_base);
|
||||
bool fsync_meta(int wait_base);
|
||||
int fsync_buffer(int wait_base);
|
||||
bool trim_lsn(int wait_base);
|
||||
public:
|
||||
journal_flusher_co();
|
||||
~journal_flusher_co();
|
||||
bool loop();
|
||||
};
|
||||
|
||||
// Journal flusher itself
|
||||
class journal_flusher_t
|
||||
{
|
||||
int trim_wanted = 0;
|
||||
bool dequeuing;
|
||||
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int flusher_start_threshold;
|
||||
int force_start = 0;
|
||||
int min_flusher_count = 0, max_flusher_count = 0, cur_flusher_count = 0, target_flusher_count = 0;
|
||||
journal_flusher_co *co;
|
||||
blockstore_impl_t *bs;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
int journal_trim_counter;
|
||||
bool trimming;
|
||||
void* journal_superblock;
|
||||
int advance_lsn_counter = 0;
|
||||
uint64_t compact_counter = 0;
|
||||
|
||||
int active_flushers;
|
||||
int syncing_flushers;
|
||||
std::list<flusher_sync_t> syncs;
|
||||
std::map<object_id, uint64_t> sync_to_repeat;
|
||||
|
||||
std::map<uint64_t, meta_sector_t> meta_sectors;
|
||||
std::deque<object_id> flush_queue;
|
||||
std::unordered_map<object_id, uint64_t> flush_versions;
|
||||
std::unordered_set<uint64_t> inflight_meta_sectors;
|
||||
|
||||
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
|
||||
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
|
||||
int active_flushers = 0;
|
||||
int wanting_meta_fsync = 0;
|
||||
bool fsyncing_meta = false;
|
||||
int syncing_buffer = 0;
|
||||
|
||||
public:
|
||||
journal_flusher_t(blockstore_impl_t *bs);
|
||||
~journal_flusher_t();
|
||||
void loop();
|
||||
bool is_trim_wanted() { return trim_wanted; }
|
||||
int get_syncing_buffer();
|
||||
uint64_t get_compact_counter();
|
||||
bool is_active();
|
||||
void mark_trim_possible();
|
||||
void request_trim();
|
||||
void release_trim();
|
||||
void enqueue_flush(obj_ver_id oid);
|
||||
void unshift_flush(obj_ver_id oid, bool force);
|
||||
void remove_flush(object_id oid);
|
||||
void dump_diagnostics();
|
||||
bool is_mutated(uint64_t clean_loc);
|
||||
};
|
||||
|
2395
src/blockstore/blockstore_heap.cpp
Normal file
2395
src/blockstore/blockstore_heap.cpp
Normal file
File diff suppressed because it is too large
Load Diff
376
src/blockstore/blockstore_heap.h
Normal file
376
src/blockstore/blockstore_heap.h
Normal file
@@ -0,0 +1,376 @@
|
||||
// Metadata storage version 3 ("heap")
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <set>
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
|
||||
#include "../client/object_id.h"
|
||||
#include "../util/robin_hood.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "multilist.h"
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
#define BS_HEAP_TYPE 7
|
||||
#define BS_HEAP_OBJECT 1
|
||||
#define BS_HEAP_SMALL_WRITE 2
|
||||
#define BS_HEAP_BIG_WRITE 3
|
||||
#define BS_HEAP_TOMBSTONE 4
|
||||
#define BS_HEAP_INTENT_WRITE 5
|
||||
#define BS_HEAP_STABLE 8
|
||||
|
||||
class blockstore_heap_t;
|
||||
|
||||
struct __attribute__((__packed__)) heap_small_write_t
|
||||
{
|
||||
uint16_t size;
|
||||
int16_t next_pos;
|
||||
uint8_t flags;
|
||||
uint64_t lsn;
|
||||
uint64_t version;
|
||||
uint64_t location;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_big_write_t
|
||||
{
|
||||
uint16_t size;
|
||||
int16_t next_pos;
|
||||
uint8_t flags;
|
||||
uint64_t lsn;
|
||||
uint64_t version;
|
||||
uint32_t block_num;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_tombstone_t
|
||||
{
|
||||
uint16_t size;
|
||||
int16_t next_pos;
|
||||
uint8_t flags;
|
||||
uint64_t lsn;
|
||||
uint64_t version;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_write_t
|
||||
{
|
||||
// size should have top bit cleared
|
||||
uint16_t size = 0;
|
||||
int16_t next_pos = 0;
|
||||
uint8_t entry_type = 0; // BS_HEAP_*
|
||||
uint64_t lsn = 0;
|
||||
uint64_t version = 0;
|
||||
|
||||
// uint8_t[] external_bitmap
|
||||
// uint8_t[] internal_bitmap
|
||||
// uint32_t[] checksums
|
||||
|
||||
heap_write_t *next();
|
||||
inline uint8_t type() const { return (entry_type & BS_HEAP_TYPE); }
|
||||
inline heap_small_write_t& small() { return *(heap_small_write_t*)this; }
|
||||
inline heap_big_write_t& big() { return *(heap_big_write_t*)this; }
|
||||
uint32_t get_size(blockstore_heap_t *heap);
|
||||
uint32_t get_csum_size(blockstore_heap_t *heap);
|
||||
bool needs_recheck(blockstore_heap_t *heap);
|
||||
bool needs_compact(blockstore_heap_t *heap);
|
||||
bool is_compacted(uint64_t compacted_lsn);
|
||||
bool can_be_collapsed(blockstore_heap_t *heap);
|
||||
bool is_allowed_before_compacted(uint64_t compacted_lsn, bool is_last_entry);
|
||||
uint8_t *get_ext_bitmap(blockstore_heap_t *heap);
|
||||
uint8_t *get_int_bitmap(blockstore_heap_t *heap);
|
||||
uint8_t *get_checksums(blockstore_heap_t *heap);
|
||||
uint32_t *get_checksum(blockstore_heap_t *heap);
|
||||
uint64_t big_location(blockstore_heap_t *heap);
|
||||
void set_big_location(blockstore_heap_t *heap, uint64_t location);
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_object_t
|
||||
{
|
||||
// size should have top bit cleared
|
||||
uint16_t size = 0;
|
||||
// linked list of write entries...
|
||||
// newest entries are stored first to simplify scanning
|
||||
int16_t write_pos = 0;
|
||||
uint8_t entry_type = 0; // BS_HEAP_*
|
||||
uint32_t crc32c = 0;
|
||||
uint64_t inode = 0;
|
||||
uint64_t stripe = 0;
|
||||
|
||||
heap_write_t *get_writes();
|
||||
uint32_t calc_crc32c();
|
||||
};
|
||||
|
||||
struct heap_object_lsn_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t lsn;
|
||||
};
|
||||
|
||||
inline bool operator < (const heap_object_lsn_t & a, const heap_object_lsn_t & b)
|
||||
{
|
||||
return a.oid < b.oid || a.oid == b.oid && a.lsn < b.lsn;
|
||||
}
|
||||
|
||||
struct tmp_compact_item_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t lsn;
|
||||
bool compact;
|
||||
};
|
||||
|
||||
struct heap_mvcc_copy_id_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t copy_id;
|
||||
};
|
||||
|
||||
inline bool operator == (const heap_mvcc_copy_id_t & a, const heap_mvcc_copy_id_t & b)
|
||||
{
|
||||
return a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe && a.copy_id == b.copy_id;
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<> struct hash<heap_mvcc_copy_id_t>
|
||||
{
|
||||
inline size_t operator()(const heap_mvcc_copy_id_t &s) const
|
||||
{
|
||||
size_t seed = std::hash<object_id>()(s.oid);
|
||||
// Copy-pasted from spp::hash_combine()
|
||||
seed ^= (s.copy_id + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct heap_object_mvcc_t
|
||||
{
|
||||
uint32_t readers = 0;
|
||||
heap_object_t *entry_copy = NULL;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_block_info_t
|
||||
{
|
||||
uint32_t used_space = 0;
|
||||
uint32_t free_pos = 0;
|
||||
uint8_t *data = NULL;
|
||||
};
|
||||
|
||||
struct heap_inflight_lsn_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t flags;
|
||||
};
|
||||
|
||||
struct heap_refqi_t
|
||||
{
|
||||
uint64_t lsn;
|
||||
uint64_t inode;
|
||||
uint64_t location;
|
||||
uint32_t len;
|
||||
bool is_data;
|
||||
};
|
||||
|
||||
using i64hash_t = robin_hood::hash<uint64_t>;
|
||||
using heap_block_index_t = robin_hood::unordered_flat_map<uint64_t,
|
||||
robin_hood::unordered_flat_map<inode_t, robin_hood::unordered_flat_map<uint64_t, uint64_t, i64hash_t, std::equal_to<uint64_t>, 88>, i64hash_t>, i64hash_t>;
|
||||
using heap_mvcc_map_t = robin_hood::unordered_flat_map<heap_mvcc_copy_id_t, heap_object_mvcc_t>;
|
||||
|
||||
class blockstore_heap_t
|
||||
{
|
||||
friend class heap_write_t;
|
||||
friend class heap_object_t;
|
||||
|
||||
blockstore_disk_t *dsk = NULL;
|
||||
uint8_t* buffer_area = NULL;
|
||||
bool abort_on_corruption = false;
|
||||
bool abort_on_overlap = true;
|
||||
int log_level = 0;
|
||||
|
||||
const uint32_t meta_block_count = 0;
|
||||
uint32_t target_block_free_space = 800;
|
||||
|
||||
uint64_t next_lsn = 0;
|
||||
robin_hood::unordered_flat_map<pool_id_t, pool_shard_settings_t> pool_shard_settings;
|
||||
// PG => inode => stripe => block number
|
||||
heap_block_index_t block_index;
|
||||
std::vector<heap_block_info_t> block_info;
|
||||
allocator_t *data_alloc = NULL;
|
||||
multilist_index_t *meta_alloc = NULL;
|
||||
uint32_t meta_alloc_count = 0;
|
||||
uint64_t meta_used_space = 0;
|
||||
multilist_alloc_t *buffer_alloc = NULL;
|
||||
heap_mvcc_map_t object_mvcc;
|
||||
std::unordered_map<uint64_t, uint32_t> mvcc_data_refs;
|
||||
std::unordered_map<uint64_t, uint32_t> mvcc_buffer_refs;
|
||||
std::map<uint64_t, uint64_t> inode_space_stats;
|
||||
uint64_t buffer_area_used_space = 0;
|
||||
uint64_t data_used_space = 0;
|
||||
|
||||
// LSN queue: inflight (writing) -> completed [-> fsynced] -> compactable -> compacted [-> fsynced] -> trimmed and removed
|
||||
std::deque<heap_inflight_lsn_t> inflight_lsn;
|
||||
uint32_t to_compact_count = 0;
|
||||
uint64_t first_inflight_lsn = 0;
|
||||
uint64_t completed_lsn = 0;
|
||||
uint64_t fsynced_lsn = 0;
|
||||
uint64_t compacted_lsn = 0;
|
||||
uint64_t next_compact_lsn = 0;
|
||||
std::deque<heap_refqi_t> overwrite_ref_queue;
|
||||
|
||||
std::vector<tmp_compact_item_t> tmp_compact_queue;
|
||||
std::deque<object_id> recheck_queue;
|
||||
int recheck_in_progress = 0;
|
||||
bool in_recheck = false;
|
||||
std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> recheck_cb;
|
||||
int recheck_queue_depth = 0;
|
||||
|
||||
const uint32_t max_write_entry_size;
|
||||
|
||||
uint64_t get_pg_id(inode_t inode, uint64_t stripe);
|
||||
void defragment_block(uint32_t block_num);
|
||||
uint32_t find_block_run(heap_block_info_t & block, uint32_t space);
|
||||
uint32_t find_block_space(uint32_t block_num, uint32_t space);
|
||||
uint32_t block_has_compactable(uint8_t *data);
|
||||
uint32_t compact_object_to(heap_object_t *obj, uint64_t lsn, uint8_t *new_csums, bool do_free);
|
||||
void copy_full_object(uint8_t *dst, heap_object_t *obj);
|
||||
bool mvcc_save_copy(heap_object_t *obj);
|
||||
bool mvcc_check_tracking(object_id oid);
|
||||
void free_mvcc(heap_mvcc_map_t::iterator mvcc_it);
|
||||
void allocate_block(heap_block_info_t & inf);
|
||||
int allocate_new_object(object_id oid, uint32_t full_object_size, uint32_t *modified_block, heap_object_t **new_obj);
|
||||
int add_object(object_id oid, heap_write_t *wr, uint32_t *modified_block);
|
||||
void mark_overwritten(uint64_t over_lsn, uint64_t inode, heap_write_t *wr, heap_write_t *end_wr, bool tracking_active);
|
||||
int update_object(uint32_t block_num, heap_object_t *obj, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
void init_erase(uint32_t block_num, heap_object_t *obj);
|
||||
void erase_object(uint32_t block_num, heap_object_t *obj, uint64_t lsn, bool tracking_active);
|
||||
void reindex_block(uint32_t block_num, heap_object_t *from_obj);
|
||||
void erase_block_index(inode_t inode, uint64_t stripe);
|
||||
void deref_data(uint64_t inode, uint64_t location, bool free_at_0);
|
||||
void deref_buffer(uint64_t inode, uint64_t location, uint32_t len, bool free_at_0);
|
||||
void deref_overwrites(uint64_t lsn);
|
||||
void free_object_space(inode_t inode, heap_write_t *from, heap_write_t *to, int mode = 0);
|
||||
void add_used_space(uint32_t block_num, int32_t used_delta);
|
||||
void push_inflight_lsn(object_id oid, uint64_t lsn, uint64_t flags);
|
||||
|
||||
public:
|
||||
blockstore_heap_t(blockstore_disk_t *dsk, uint8_t *buffer_area, int log_level = 0);
|
||||
~blockstore_heap_t();
|
||||
// set initially compacted lsn - should be done before loading
|
||||
void set_compacted_lsn(uint64_t compacted_lsn);
|
||||
uint64_t get_compacted_lsn();
|
||||
// load data from the disk, returns count of loaded write entries
|
||||
void read_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf,
|
||||
std::function<void(heap_object_t*)> handle_object, std::function<void(uint32_t, uint32_t, uint8_t*)> handle_block);
|
||||
uint64_t load_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf);
|
||||
// finish loading
|
||||
void finish_load();
|
||||
// recheck small write data after reading the database from disk
|
||||
bool recheck_small_writes(std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> read_buffer, int queue_depth);
|
||||
// initialize metadata area (fill it with empty data)
|
||||
// returns 0 when done, EAGAIN when the caller has to wait more
|
||||
int initialize();
|
||||
// read from the metadata area
|
||||
// returns 0 when done, EAGAIN when the caller has to wait more
|
||||
int read();
|
||||
// reshard database according to the pool's PG count
|
||||
void reshard(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
// read an object entry and lock it against removal
|
||||
// in the future, may become asynchronous
|
||||
heap_object_t *lock_and_read_entry(object_id oid, uint64_t & copy_id);
|
||||
// re-read a locked object entry with the given lsn (pointer may be invalidated)
|
||||
heap_object_t *read_locked_entry(object_id oid, uint64_t copy_id);
|
||||
// read an object entry without locking it
|
||||
heap_object_t *read_entry(object_id oid, uint32_t *block_num_ptr, bool for_update = false);
|
||||
// unlock an entry
|
||||
bool unlock_entry(object_id oid, uint64_t copy_id);
|
||||
// set or verify checksums in a write request
|
||||
bool calc_checksums(heap_write_t *wr, uint8_t *data, bool set, uint32_t offset = 0, uint32_t len = 0);
|
||||
// set or verify raw block checksums
|
||||
bool calc_block_checksums(uint32_t *block_csums, uint8_t *data, uint8_t *bitmap, uint32_t start, uint32_t end,
|
||||
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool calc_block_checksums(uint32_t *block_csums, uint8_t *bitmap,
|
||||
uint32_t start, uint32_t end, std::function<uint8_t*(uint32_t start, uint32_t & len)> next,
|
||||
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
// copy an object as is
|
||||
int copy_object(heap_object_t *obj, uint32_t *modified_block);
|
||||
// auto-compacts the object, then adds a write entry to it and to the compaction queue
|
||||
// return 0 if OK, or maybe ENOSPC
|
||||
int post_write(object_id oid, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
int post_write(uint32_t & block_num, object_id oid, heap_object_t *obj, heap_write_t *wr, uint32_t *moved_from_block);
|
||||
// stabilize an unstable object version
|
||||
// return 0 if OK, ENOENT if not exists
|
||||
int post_stabilize(object_id oid, uint64_t version, uint32_t *modified_block, uint64_t *new_lsn, uint64_t *new_to_lsn);
|
||||
// rollback an unstable object version
|
||||
// return 0 if OK, ENOENT if not exists, EBUSY if already stable
|
||||
int post_rollback(object_id oid, uint64_t version, uint64_t *new_lsn, uint32_t *modified_block);
|
||||
// forget an object
|
||||
// return error code
|
||||
int post_delete(object_id oid, uint64_t *new_lsn, uint32_t *modified_block);
|
||||
int post_delete(uint32_t block_num, heap_object_t *obj, uint64_t *new_lsn);
|
||||
// get the next object to compact
|
||||
// guaranteed to return objects in min lsn order
|
||||
// returns 0 if OK, ENOENT if nothing to compact
|
||||
int get_next_compact(object_id & oid);
|
||||
// get the range of an object eligible for compaction
|
||||
void get_compact_range(heap_object_t *obj, uint64_t max_lsn, heap_write_t **begin_wr, heap_write_t **end_wr);
|
||||
// mark an object as compacted up to the given lsn
|
||||
int compact_object(object_id oid, uint64_t lsn, uint8_t *new_csums);
|
||||
// retrieve object listing from a PG
|
||||
int list_objects(uint32_t pg_num, object_id min_oid, object_id max_oid,
|
||||
obj_ver_id **result_list, size_t *stable_count, size_t *unstable_count);
|
||||
// set a block number for a new object and returns error status: 0, EAGAIN or ENOSPC
|
||||
int get_block_for_new_object(uint32_t & out_block_num, uint32_t size = 0);
|
||||
|
||||
// inflight write tracking
|
||||
void mark_lsn_completed(uint64_t lsn);
|
||||
void mark_lsn_fsynced(uint64_t lsn);
|
||||
void mark_lsn_compacted(uint64_t lsn, bool allow_undone = false);
|
||||
void mark_object_compacted(heap_object_t *obj, uint64_t max_lsn);
|
||||
void mark_lsn_trimmed(uint64_t lsn);
|
||||
uint64_t get_completed_lsn();
|
||||
uint64_t get_fsynced_lsn();
|
||||
|
||||
// data device block allocator functions
|
||||
uint64_t find_free_data();
|
||||
bool is_data_used(uint64_t location);
|
||||
void use_data(inode_t inode, uint64_t location);
|
||||
void free_data(inode_t inode, uint64_t location);
|
||||
|
||||
// buffer device allocator functions
|
||||
uint64_t find_free_buffer_area(uint64_t size);
|
||||
bool is_buffer_area_free(uint64_t location, uint64_t size);
|
||||
void use_buffer_area(inode_t inode, uint64_t location, uint64_t size);
|
||||
void free_buffer_area(inode_t inode, uint64_t location, uint64_t size);
|
||||
uint64_t get_buffer_area_used_space();
|
||||
|
||||
// get metadata block data buffer and used space
|
||||
uint8_t *get_meta_block(uint32_t block_num);
|
||||
uint32_t get_meta_block_used_space(uint32_t block_num);
|
||||
|
||||
// get space usage statistics
|
||||
uint64_t get_data_used_space();
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
uint64_t get_meta_total_space();
|
||||
uint64_t get_meta_used_space();
|
||||
uint32_t get_meta_nearfull_blocks();
|
||||
uint32_t get_inflight_queue_size();
|
||||
uint32_t get_compact_queue_size();
|
||||
uint32_t get_to_compact_count();
|
||||
|
||||
// get maximum size for a temporary heap_write_t buffer
|
||||
uint32_t get_max_write_entry_size();
|
||||
|
||||
// only for tests
|
||||
void set_abort_on_corruption(bool fail);
|
||||
void set_abort_on_overlap(bool fail);
|
||||
};
|
@@ -1,13 +1,17 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
#include <stdexcept>
|
||||
|
||||
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
|
||||
#include "blockstore_impl.h"
|
||||
#include "crc32c.h"
|
||||
|
||||
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode)
|
||||
{
|
||||
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
|
||||
this->tfd = tfd;
|
||||
this->ringloop = ringloop;
|
||||
dsk.mock_mode = mock_mode;
|
||||
ring_consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(&ring_consumer);
|
||||
initialized = 0;
|
||||
@@ -17,31 +21,43 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
calc_lengths();
|
||||
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
|
||||
dsk.calc_lengths();
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||
data_alloc = new allocator_t(dsk.block_count);
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
dsk.close_all();
|
||||
throw;
|
||||
}
|
||||
memset(zero_object, 0, dsk.data_block_size);
|
||||
meta_superblock = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
|
||||
memset(meta_superblock, 0, dsk.meta_block_size);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::init()
|
||||
{
|
||||
flusher = new journal_flusher_t(this);
|
||||
if (dsk.inmemory_journal)
|
||||
{
|
||||
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
|
||||
}
|
||||
heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
|
||||
}
|
||||
|
||||
blockstore_impl_t::~blockstore_impl_t()
|
||||
{
|
||||
delete data_alloc;
|
||||
delete flusher;
|
||||
if (flusher)
|
||||
delete flusher;
|
||||
if (heap)
|
||||
delete heap;
|
||||
if (buffer_area)
|
||||
free(buffer_area);
|
||||
if (meta_superblock)
|
||||
free(meta_superblock);
|
||||
if (zero_object)
|
||||
free(zero_object);
|
||||
ringloop->unregister_consumer(&ring_consumer);
|
||||
dsk.close_all();
|
||||
if (metadata_buffer)
|
||||
free(metadata_buffer);
|
||||
if (clean_bitmaps)
|
||||
free(clean_bitmaps);
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::is_started()
|
||||
@@ -57,10 +73,9 @@ bool blockstore_impl_t::is_stalled()
|
||||
// main event loop - produce requests
|
||||
void blockstore_impl_t::loop()
|
||||
{
|
||||
// FIXME: initialized == 10 is ugly
|
||||
if (initialized != 10)
|
||||
{
|
||||
// read metadata, then journal
|
||||
// read metadata
|
||||
if (initialized == 0)
|
||||
{
|
||||
metadata_init_reader = new blockstore_init_meta(this);
|
||||
@@ -73,69 +88,41 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
delete metadata_init_reader;
|
||||
metadata_init_reader = NULL;
|
||||
journal_init_reader = new blockstore_init_journal(this);
|
||||
initialized = 2;
|
||||
}
|
||||
}
|
||||
if (initialized == 2)
|
||||
{
|
||||
int res = journal_init_reader->loop();
|
||||
if (!res)
|
||||
{
|
||||
delete journal_init_reader;
|
||||
journal_init_reader = NULL;
|
||||
initialized = 3;
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
if (initialized == 3)
|
||||
{
|
||||
if (!readonly && dsk.discard_on_start)
|
||||
dsk.trim_data(data_alloc);
|
||||
if (journal.flush_journal)
|
||||
initialized = 4;
|
||||
else
|
||||
initialized = 10;
|
||||
}
|
||||
if (initialized == 4)
|
||||
{
|
||||
if (readonly)
|
||||
{
|
||||
printf("Can't flush the journal in readonly mode\n");
|
||||
exit(1);
|
||||
dsk.trim_data([this](uint64_t block_num){ return heap->is_data_used(block_num * dsk.data_block_size); });
|
||||
}
|
||||
flusher->loop();
|
||||
ringloop->submit();
|
||||
initialized = 10;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// try to submit ops
|
||||
unsigned initial_ring_space = ringloop->space_left();
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
int op_idx = 0, new_idx = 0;
|
||||
bool has_unfinished_writes = false;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
auto op = submit_queue[op_idx];
|
||||
submit_queue[new_idx] = op;
|
||||
// FIXME: This needs some simplification
|
||||
// Writes should not block reads if the ring is not full and reads don't depend on them
|
||||
// In all other cases we should stop submission
|
||||
if (PRIV(op)->wait_for)
|
||||
{
|
||||
check_wait(op);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
else if (PRIV(op)->wait_for)
|
||||
{
|
||||
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
has_writes = 2;
|
||||
}
|
||||
has_unfinished_writes = has_unfinished_writes || op->opcode == BS_OP_WRITE ||
|
||||
op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE ||
|
||||
op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -148,46 +135,33 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
wr_st = dequeue_read(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
|
||||
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
if (has_writes == 2)
|
||||
{
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
wr_st = dequeue_write(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
else if (op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
if (has_writes == 2)
|
||||
{
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
wr_st = dequeue_del(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
|
||||
}
|
||||
else if (op->opcode == BS_OP_SYNC)
|
||||
{
|
||||
// sync only completed writes?
|
||||
// wait for the data device fsync to complete, then submit journal writes for big writes
|
||||
// then submit an fsync operation
|
||||
// syncs only completed writes, so doesn't have to be blocked by anything
|
||||
wr_st = continue_sync(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_STABLE)
|
||||
else if (op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK)
|
||||
{
|
||||
wr_st = dequeue_stable(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_ROLLBACK)
|
||||
{
|
||||
wr_st = dequeue_rollback(op);
|
||||
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
|
||||
}
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't have to be blocked by previous modifications
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
// LIST has to be blocked by previous writes and commits/rollbacks
|
||||
if (!has_unfinished_writes)
|
||||
{
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
wr_st = 0;
|
||||
}
|
||||
}
|
||||
if (wr_st == 2)
|
||||
{
|
||||
@@ -196,16 +170,13 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
if (wr_st == 0)
|
||||
{
|
||||
PRIV(op)->pending_ops = 0;
|
||||
ringloop->restore(prev_sqe_pos);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
{
|
||||
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_idx != new_idx)
|
||||
@@ -225,12 +196,6 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
|
||||
}
|
||||
for (auto s: journal.submitting_sectors)
|
||||
{
|
||||
// Mark journal sector writes as submitted
|
||||
journal.sector_info[s].submit_id = 0;
|
||||
}
|
||||
journal.submitting_sectors.clear();
|
||||
if ((initial_ring_space - ringloop->space_left()) > 0)
|
||||
{
|
||||
live = true;
|
||||
@@ -248,7 +213,7 @@ bool blockstore_impl_t::is_safe_to_stop()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
|
||||
if (unsynced_big_write_count > 0 || unsynced_small_write_count > 0)
|
||||
{
|
||||
if (!readonly && !stop_sync_submitted)
|
||||
{
|
||||
@@ -272,7 +237,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
|
||||
if (ringloop->space_left() < PRIV(op)->wait_detail)
|
||||
{
|
||||
// stop submission if there's still no free space
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
@@ -282,40 +247,13 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
}
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
else if (PRIV(op)->wait_for == WAIT_COMPACTION)
|
||||
{
|
||||
if (journal.used_start == PRIV(op)->wait_detail &&
|
||||
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
|
||||
if (flusher->get_compact_counter() <= PRIV(op)->wait_detail)
|
||||
{
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
flusher->release_trim();
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
|
||||
{
|
||||
int next = ((journal.cur_sector + 1) % journal.sector_count);
|
||||
if (journal.sector_info[next].flush_count > 0 ||
|
||||
journal.sector_info[next].dirty)
|
||||
{
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for a journal buffer\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_FREE)
|
||||
{
|
||||
if (!data_alloc->get_free_count() && big_to_flush > 0)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for free space on the data device\n");
|
||||
printf("Still waiting for more flushes\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
@@ -361,75 +299,11 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
|
||||
{
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
new ((void*)op->private_data) blockstore_op_private_t;
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->wait_for = 0;
|
||||
PRIV(op)->op_state = 0;
|
||||
PRIV(op)->pending_ops = 0;
|
||||
}
|
||||
|
||||
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
||||
{
|
||||
while (search_start < search_end)
|
||||
{
|
||||
int pos = search_start+(search_end-search_start)/2;
|
||||
if (oid < list[pos].oid)
|
||||
{
|
||||
search_end = pos;
|
||||
}
|
||||
else if (list[pos].oid < oid)
|
||||
{
|
||||
search_start = pos+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
list[pos].version = version;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
|
||||
{
|
||||
uint64_t pg_num = 0;
|
||||
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it != clean_db_settings.end())
|
||||
{
|
||||
// like map_to_pg()
|
||||
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
|
||||
}
|
||||
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
|
||||
}
|
||||
|
||||
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pool_id = (uint64_t)pool;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
// like map_to_pg()
|
||||
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
|
||||
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
|
||||
new_shards[shard_id][pair.first] = pair.second;
|
||||
}
|
||||
clean_db_shards.erase(sh_it++);
|
||||
}
|
||||
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
|
||||
{
|
||||
auto & to = clean_db_shards[sh_it->first];
|
||||
to.swap(sh_it->second);
|
||||
}
|
||||
clean_db_settings[pool_id] = (pool_shard_settings_t){
|
||||
.pg_count = pg_count,
|
||||
.pg_stripe_size = pg_stripe_size,
|
||||
};
|
||||
}
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->pg_number+1;
|
||||
@@ -438,7 +312,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
uint64_t min_inode = op->min_oid.inode;
|
||||
uint64_t max_inode = op->max_oid.inode;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||
if (!pg_count || (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count) ||
|
||||
!INODE_POOL(min_inode) || INODE_POOL(min_inode) != INODE_POOL(max_inode))
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
FINISH_OP(op);
|
||||
@@ -446,250 +321,40 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
// Check if the DB needs resharding
|
||||
// (we don't know about PGs from the beginning, we only create "shards" here)
|
||||
uint64_t first_shard = 0, last_shard = UINT64_MAX;
|
||||
if (min_inode != 0 &&
|
||||
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
|
||||
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
|
||||
heap->reshard(INODE_POOL(min_inode), pg_count, pg_stripe_size);
|
||||
obj_ver_id *result = NULL;
|
||||
size_t stable_count = 0, unstable_count = 0;
|
||||
int res = heap->list_objects(list_pg, op->min_oid, op->max_oid, &result, &stable_count, &unstable_count);
|
||||
if (op->list_stable_limit)
|
||||
{
|
||||
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
|
||||
if (pg_count > 1)
|
||||
// Ordered result is expected - used by scrub
|
||||
// We use an unordered map
|
||||
std::sort(result, result + stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
{
|
||||
// Per-pg listing
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it == clean_db_settings.end() ||
|
||||
sh_it->second.pg_count != pg_count ||
|
||||
sh_it->second.pg_stripe_size != pg_stripe_size)
|
||||
{
|
||||
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
|
||||
}
|
||||
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-pool listing
|
||||
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
|
||||
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
|
||||
memmove(result + op->list_stable_limit, result + stable_count, unstable_count);
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
}
|
||||
// Copy clean_db entries
|
||||
int stable_count = 0, stable_alloc = 0;
|
||||
if (min_inode != max_inode)
|
||||
{
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
stable_alloc = op->list_stable_limit;
|
||||
if (stable_alloc > 1024*1024)
|
||||
stable_alloc = 1024*1024;
|
||||
}
|
||||
if (stable_alloc < 32768)
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
auto max_oid = op->max_oid;
|
||||
bool limited = false;
|
||||
pool_pg_id_t last_shard_id = 0;
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
clean_it = clean_db.lower_bound(op->min_oid);
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
clean_end = clean_db.upper_bound(max_oid);
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc *= 2;
|
||||
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
if (!limited)
|
||||
{
|
||||
limited = true;
|
||||
max_oid = stable[stable_count-1].oid;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
// To maintain the order, we have to include objects in the same range from other shards
|
||||
if (last_shard_id != 0 && last_shard_id != shard_it->first)
|
||||
std::sort(stable, stable+stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
last_shard_id = shard_it->first;
|
||||
}
|
||||
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
// Copy dirty_db entries (sorted, too)
|
||||
int unstable_count = 0, unstable_alloc = 0;
|
||||
obj_ver_id *unstable = NULL;
|
||||
{
|
||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
dirty_it = dirty_db.lower_bound({
|
||||
.oid = op->min_oid,
|
||||
.version = 0,
|
||||
});
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
dirty_end = dirty_db.upper_bound({
|
||||
.oid = max_oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
for (; dirty_it != dirty_end; dirty_it++)
|
||||
{
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
|
||||
{
|
||||
if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
// Deletions are always stable, so try to zero out two possible entries
|
||||
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
|
||||
{
|
||||
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
|
||||
}
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
|
||||
{
|
||||
// First try to replace a clean stable version in the first part of the list
|
||||
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
|
||||
{
|
||||
// Then try to replace the last dirty stable version in the second part of the list
|
||||
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
|
||||
{
|
||||
stable[stable_count-1].version = dirty_it->first.version;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc += 32768;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
// Stop here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (unstable_count >= unstable_alloc)
|
||||
{
|
||||
unstable_alloc += 32768;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (stable)
|
||||
free(stable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
unstable = nst;
|
||||
}
|
||||
unstable[unstable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remove zeroed out stable entries
|
||||
int j = 0;
|
||||
for (int i = 0; i < stable_count; i++)
|
||||
{
|
||||
if (stable[i].version != 0)
|
||||
{
|
||||
stable[j++] = stable[i];
|
||||
}
|
||||
}
|
||||
stable_count = j;
|
||||
if (stable_count+unstable_count > stable_alloc)
|
||||
{
|
||||
stable_alloc = stable_count+unstable_count;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
// Copy unstable entries
|
||||
for (int i = 0; i < unstable_count; i++)
|
||||
{
|
||||
stable[j++] = unstable[i];
|
||||
}
|
||||
free(unstable);
|
||||
op->version = stable_count;
|
||||
op->retval = stable_count+unstable_count;
|
||||
op->buf = stable;
|
||||
op->retval = res == 0 ? stable_count+unstable_count : -res;
|
||||
op->buf = (uint8_t*)result;
|
||||
FINISH_OP(op);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::dump_diagnostics()
|
||||
{
|
||||
journal.dump_diagnostics();
|
||||
flusher->dump_diagnostics();
|
||||
}
|
||||
|
||||
void blockstore_meta_header_v3_t::set_crc32c()
|
||||
{
|
||||
header_csum = 0;
|
||||
uint32_t calc = crc32c(0, this, version == BLOCKSTORE_META_FORMAT_HEAP
|
||||
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
|
||||
header_csum = calc;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
|
||||
{
|
||||
if (retval == -EAGAIN)
|
||||
@@ -703,85 +368,7 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
uint64_t blockstore_impl_t::get_free_block_count()
|
||||
{
|
||||
for (auto & np: no_inode_stats)
|
||||
{
|
||||
np.second = 2;
|
||||
}
|
||||
for (auto pool_id: pool_ids)
|
||||
{
|
||||
if (!no_inode_stats[pool_id])
|
||||
recalc_inode_space_stats(pool_id, false);
|
||||
no_inode_stats[pool_id] = 1;
|
||||
}
|
||||
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
|
||||
{
|
||||
if (np_it->second == 2)
|
||||
{
|
||||
recalc_inode_space_stats(np_it->first, true);
|
||||
no_inode_stats.erase(np_it++);
|
||||
}
|
||||
else
|
||||
np_it++;
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
|
||||
{
|
||||
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
|
||||
inode_space_stats.erase(sp_begin, sp_end);
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
}
|
||||
sh_it++;
|
||||
}
|
||||
object_id last_oid = {};
|
||||
bool last_exists = false;
|
||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
|
||||
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
bool exists = false;
|
||||
if (last_oid == dirty_it->first.oid)
|
||||
{
|
||||
exists = last_exists;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & clean_db = clean_db_shard(dirty_it->first.oid);
|
||||
auto clean_it = clean_db.find(dirty_it->first.oid);
|
||||
exists = clean_it != clean_db.end();
|
||||
}
|
||||
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
if (!exists)
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
last_exists = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (exists)
|
||||
{
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(space_id);
|
||||
}
|
||||
last_exists = false;
|
||||
}
|
||||
last_oid = dirty_it->first.oid;
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
return dsk.block_count - heap->get_data_used_space()/dsk.data_block_size;
|
||||
}
|
||||
|
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "blockstore.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_heap.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
@@ -21,45 +22,16 @@
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "cpp-btree/btree_map.h"
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "allocator.h"
|
||||
|
||||
//#define BLOCKSTORE_DEBUG
|
||||
|
||||
// States are not stored on disk. Instead, they're deduced from the journal
|
||||
|
||||
#define BS_ST_SMALL_WRITE 0x01
|
||||
#define BS_ST_BIG_WRITE 0x02
|
||||
#define BS_ST_DELETE 0x03
|
||||
|
||||
#define BS_ST_WAIT_DEL 0x10
|
||||
#define BS_ST_WAIT_BIG 0x20
|
||||
#define BS_ST_IN_FLIGHT 0x30
|
||||
#define BS_ST_SUBMITTED 0x40
|
||||
#define BS_ST_WRITTEN 0x50
|
||||
#define BS_ST_SYNCED 0x60
|
||||
#define BS_ST_STABLE 0x70
|
||||
|
||||
#define BS_ST_INSTANT 0x100
|
||||
|
||||
#define IMMEDIATE_NONE 0
|
||||
#define IMMEDIATE_SMALL 1
|
||||
#define IMMEDIATE_ALL 2
|
||||
|
||||
#define BS_ST_TYPE_MASK 0x0F
|
||||
#define BS_ST_WORKFLOW_MASK 0xF0
|
||||
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED)
|
||||
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE)
|
||||
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED)
|
||||
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
|
||||
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
|
||||
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
|
||||
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
|
||||
|
||||
#define BS_SUBMIT_CHECK_SQES(n) \
|
||||
if (ringloop->sqes_left() < (n))\
|
||||
if (ringloop->space_left() < (n))\
|
||||
{\
|
||||
/* Pause until there are more requests available */\
|
||||
PRIV(op)->wait_detail = (n);\
|
||||
@@ -91,13 +63,6 @@
|
||||
return 0;\
|
||||
}
|
||||
|
||||
#include "blockstore_journal.h"
|
||||
|
||||
// "VITAstor"
|
||||
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
|
||||
#define BLOCKSTORE_META_FORMAT_V1 1
|
||||
#define BLOCKSTORE_META_FORMAT_V2 2
|
||||
|
||||
// metadata header (superblock)
|
||||
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
|
||||
{
|
||||
@@ -122,75 +87,26 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
|
||||
uint32_t header_csum;
|
||||
};
|
||||
|
||||
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
||||
// per "clean" entry on disk with fixed metadata tables
|
||||
struct __attribute__((__packed__)) clean_disk_entry
|
||||
struct __attribute__((__packed__)) blockstore_meta_header_v3_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t zero;
|
||||
uint64_t magic;
|
||||
uint64_t version;
|
||||
uint8_t bitmap[];
|
||||
// Two more fields come after bitmap in metadata version 2:
|
||||
// uint32_t data_csum[];
|
||||
// uint32_t entry_csum;
|
||||
};
|
||||
uint32_t meta_block_size;
|
||||
uint32_t data_block_size;
|
||||
uint32_t bitmap_granularity;
|
||||
uint32_t data_csum_type;
|
||||
uint32_t csum_block_size;
|
||||
uint32_t header_csum;
|
||||
uint64_t compacted_lsn;
|
||||
|
||||
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
|
||||
struct __attribute__((__packed__)) clean_entry
|
||||
{
|
||||
uint64_t version;
|
||||
uint64_t location;
|
||||
void set_crc32c();
|
||||
};
|
||||
|
||||
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
|
||||
struct __attribute__((__packed__)) dirty_entry
|
||||
{
|
||||
uint32_t state;
|
||||
uint32_t flags; // unneeded, but present for alignment
|
||||
uint64_t location; // location in either journal or data -> in BYTES
|
||||
uint32_t offset; // data offset within object (stripe)
|
||||
uint32_t len; // data length
|
||||
uint64_t journal_sector; // journal sector used for this entry
|
||||
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
|
||||
};
|
||||
|
||||
// - Sync must be submitted after previous writes/deletes (not before!)
|
||||
// - Reads to the same object must be submitted after previous writes/deletes
|
||||
// are written (not necessarily synced) in their location. This is because we
|
||||
// rely on read-modify-write for erasure coding and we must return new data
|
||||
// to calculate parity for subsequent writes
|
||||
// - Writes may be submitted in any order, because they don't overlap. Each write
|
||||
// goes into a new location - either on the journal device or on the data device
|
||||
// - Stable (stabilize) must be submitted after sync of that object is completed
|
||||
// It's even OK to return an error to the caller if that object is not synced yet
|
||||
// - Journal trim may be processed only after all versions are moved to
|
||||
// the main storage AND after all read operations for older versions complete
|
||||
// - If an operation can not be submitted because the ring is full
|
||||
// we should stop submission of other operations. Otherwise some "scatter" reads
|
||||
// may end up blocked for a long time.
|
||||
// Otherwise, the submit order is free, that is all operations may be submitted immediately
|
||||
// In fact, adding a write operation must immediately result in dirty_db being populated
|
||||
|
||||
// Suspend operation until there are more free SQEs
|
||||
#define WAIT_SQE 1
|
||||
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
|
||||
#define WAIT_JOURNAL 3
|
||||
// Suspend operation until the next journal sector buffer is free
|
||||
#define WAIT_JOURNAL_BUFFER 4
|
||||
// Suspend operation until there is some free space on the data device
|
||||
#define WAIT_FREE 5
|
||||
|
||||
struct used_clean_obj_t
|
||||
{
|
||||
int refs;
|
||||
bool was_freed; // was freed by a parallel flush?
|
||||
bool was_changed; // was changed by a parallel flush?
|
||||
};
|
||||
|
||||
// https://github.com/algorithm-ninja/cpp-btree
|
||||
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
|
||||
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
|
||||
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
|
||||
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
|
||||
// Suspend until something is compacted
|
||||
#define WAIT_COMPACTION 2
|
||||
|
||||
#include "blockstore_init.h"
|
||||
|
||||
@@ -203,58 +119,47 @@ struct blockstore_op_private_t
|
||||
{
|
||||
// Wait status
|
||||
int wait_for;
|
||||
uint64_t wait_detail, wait_detail2;
|
||||
uint64_t wait_detail;
|
||||
int pending_ops;
|
||||
int op_state;
|
||||
|
||||
// Read, write, sync, stabilize
|
||||
uint64_t lsn;
|
||||
|
||||
// Read
|
||||
uint64_t clean_block_used;
|
||||
std::vector<copy_buffer_t> read_vec;
|
||||
|
||||
// Sync, write
|
||||
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
|
||||
// Write
|
||||
uint64_t location;
|
||||
bool is_big;
|
||||
|
||||
// Stabilize, rollback
|
||||
int stab_pos;
|
||||
|
||||
// Stabilize
|
||||
uint64_t to_lsn;
|
||||
|
||||
// Write
|
||||
struct iovec iov_zerofill[3];
|
||||
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
|
||||
uint64_t real_version;
|
||||
timespec tv_begin;
|
||||
|
||||
// Sync
|
||||
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
|
||||
};
|
||||
|
||||
typedef uint32_t pool_id_t;
|
||||
typedef uint64_t pool_pg_id_t;
|
||||
|
||||
#define POOL_ID_BITS 16
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
#define STAB_SPLIT_DONE 1
|
||||
#define STAB_SPLIT_WAIT 2
|
||||
#define STAB_SPLIT_SYNC 3
|
||||
#define STAB_SPLIT_TODO 4
|
||||
|
||||
class blockstore_impl_t
|
||||
{
|
||||
public:
|
||||
blockstore_disk_t dsk;
|
||||
|
||||
/******* OPTIONS *******/
|
||||
bool readonly = false;
|
||||
// It is safe to disable fsync() if drive write cache is writethrough
|
||||
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
||||
// Enable if you want every operation to be executed with an "implicit fsync"
|
||||
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
bool inmemory_meta = false;
|
||||
uint32_t meta_write_recheck_parallelism = 0;
|
||||
// Maximum and minimum flusher count
|
||||
unsigned max_flusher_count, min_flusher_count;
|
||||
unsigned journal_trim_interval;
|
||||
unsigned max_flusher_count = 0, min_flusher_count = 0;
|
||||
unsigned journal_trim_interval = 0;
|
||||
unsigned flusher_start_threshold = 0;
|
||||
// Maximum queue depth
|
||||
unsigned max_write_iodepth = 128;
|
||||
// Enable small (journaled) write throttling, useful for the SSD+HDD case
|
||||
@@ -269,139 +174,89 @@ class blockstore_impl_t
|
||||
uint64_t autosync_writes = 128;
|
||||
// Log level (0-10)
|
||||
int log_level = 0;
|
||||
// Enable correct block checksum validation on objects updated with small writes when checksum block
|
||||
// is larger than bitmap_granularity, at the expense of extra metadata fsyncs during compaction
|
||||
bool perfect_csum_update = false;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
std::map<uint64_t, int> no_inode_stats;
|
||||
uint8_t *clean_bitmaps = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
blockstore_heap_t *heap = NULL;
|
||||
uint8_t* meta_superblock = NULL;
|
||||
uint8_t *buffer_area = NULL;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
||||
int unsynced_big_write_count = 0, unsynced_small_write_count = 0, unsynced_meta_write_count = 0;
|
||||
int unsynced_queued_ops = 0;
|
||||
allocator_t *data_alloc = NULL;
|
||||
uint64_t used_blocks = 0;
|
||||
uint8_t *zero_object = NULL;
|
||||
|
||||
void *metadata_buffer = NULL;
|
||||
|
||||
struct journal_t journal;
|
||||
journal_flusher_t *flusher;
|
||||
int big_to_flush = 0;
|
||||
int write_iodepth = 0;
|
||||
bool alloc_dyn_data = false;
|
||||
|
||||
// clean data blocks referenced by read operations
|
||||
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
|
||||
int inflight_big = 0;
|
||||
bool fsyncing_data = false;
|
||||
|
||||
bool live = false, queue_stall = false;
|
||||
ring_loop_t *ringloop;
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_i *ringloop = NULL;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
|
||||
bool stop_sync_submitted;
|
||||
bool stop_sync_submitted = false;
|
||||
|
||||
inline struct io_uring_sqe* get_sqe()
|
||||
{
|
||||
return ringloop->get_sqe();
|
||||
}
|
||||
|
||||
friend class blockstore_init_meta;
|
||||
friend class blockstore_init_journal;
|
||||
friend struct blockstore_journal_check_t;
|
||||
friend class journal_flusher_t;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
void calc_lengths();
|
||||
void open_data();
|
||||
void open_meta();
|
||||
void open_journal();
|
||||
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
|
||||
|
||||
blockstore_clean_db_t& clean_db_shard(object_id oid);
|
||||
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
|
||||
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
|
||||
void disk_error_abort(const char *op, int retval, int expected);
|
||||
|
||||
// Asynchronous init
|
||||
int initialized;
|
||||
int metadata_buf_size;
|
||||
blockstore_init_meta* metadata_init_reader;
|
||||
blockstore_init_journal* journal_init_reader;
|
||||
|
||||
void init();
|
||||
void check_wait(blockstore_op_t *op);
|
||||
void init_op(blockstore_op_t *op);
|
||||
|
||||
// Read
|
||||
int dequeue_read(blockstore_op_t *read_op);
|
||||
int dequeue_read(blockstore_op_t *op);
|
||||
int fulfill_read(blockstore_op_t *op);
|
||||
uint32_t prepare_read(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_with_bitmaps(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_zero(std::vector<copy_buffer_t> & read_vec, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_simple(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
void prepare_disk_read(std::vector<copy_buffer_t> & read_vec, int pos, heap_object_t *obj, heap_write_t *wr,
|
||||
uint32_t blk_start, uint32_t blk_end, uint32_t start, uint32_t end, uint32_t copy_flags);
|
||||
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
|
||||
std::function<int(int, bool, uint32_t, uint32_t)> callback);
|
||||
int fulfill_read(blockstore_op_t *read_op,
|
||||
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location,
|
||||
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
|
||||
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
|
||||
uint8_t *clean_entry_bitmap, int *dyn_data,
|
||||
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
|
||||
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
|
||||
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
|
||||
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
|
||||
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
|
||||
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
|
||||
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
|
||||
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
|
||||
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
|
||||
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
|
||||
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||
uint32_t item_state, uint64_t item_version);
|
||||
std::function<void(int&, uint32_t, uint32_t)> callback);
|
||||
void free_read_buffers(std::vector<copy_buffer_t> & rv);
|
||||
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
|
||||
bool verify_read_checksums(blockstore_op_t *op);
|
||||
|
||||
// Write
|
||||
bool enqueue_write(blockstore_op_t *op);
|
||||
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
|
||||
void prepare_meta_block_write(blockstore_op_t *op, uint64_t modified_block, io_uring_sqe *sqe = NULL);
|
||||
int dequeue_write(blockstore_op_t *op);
|
||||
int dequeue_del(blockstore_op_t *op);
|
||||
int make_big_write(blockstore_op_t *op, uint32_t offset, uint32_t len, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
int continue_write(blockstore_op_t *op);
|
||||
void release_journal_sectors(blockstore_op_t *op);
|
||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
||||
// Sync
|
||||
int continue_sync(blockstore_op_t *op);
|
||||
void ack_sync(blockstore_op_t *op);
|
||||
bool submit_fsyncs(int & wait_count);
|
||||
int do_sync(blockstore_op_t *op, int base_state);
|
||||
|
||||
// Stabilize
|
||||
int dequeue_stable(blockstore_op_t *op);
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
||||
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
||||
|
||||
// Rollback
|
||||
int dequeue_rollback(blockstore_op_t *op);
|
||||
int continue_rollback(blockstore_op_t *op);
|
||||
void mark_rolled_back(const obj_ver_id & ov);
|
||||
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
|
||||
void free_dirty_dyn_data(dirty_entry & e);
|
||||
|
||||
// List
|
||||
void process_list(blockstore_op_t *op);
|
||||
|
||||
public:
|
||||
/*public:*/
|
||||
|
||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode = false);
|
||||
~blockstore_impl_t();
|
||||
|
||||
void parse_config(blockstore_config_t & config, bool init);
|
||||
@@ -427,21 +282,13 @@ public:
|
||||
// Simplified synchronous operation: get object bitmap & current version
|
||||
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
||||
|
||||
// Unstable writes are added here (map of object_id -> version)
|
||||
std::unordered_map<object_id, uint64_t> unstable_writes;
|
||||
|
||||
// Space usage statistics
|
||||
std::map<uint64_t, uint64_t> inode_space_stats;
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats() { return heap->get_inode_space_stats(); }
|
||||
inline uint32_t get_block_size() { return dsk.data_block_size; }
|
||||
inline uint64_t get_block_count() { return dsk.block_count; }
|
||||
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
|
||||
uint64_t get_free_block_count();
|
||||
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
|
||||
inline uint64_t get_journal_size() { return dsk.journal_len; }
|
||||
};
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -25,47 +25,10 @@ class blockstore_init_meta
|
||||
uint64_t next_offset = 0;
|
||||
uint64_t last_read_offset = 0;
|
||||
uint64_t entries_loaded = 0;
|
||||
unsigned entries_per_block = 0;
|
||||
int i = 0, j = 0;
|
||||
std::vector<uint64_t> entries_to_zero;
|
||||
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||
void handle_event(ring_data_t *data, int buf_num);
|
||||
public:
|
||||
blockstore_init_meta(blockstore_impl_t *bs);
|
||||
int loop();
|
||||
};
|
||||
|
||||
struct bs_init_journal_done
|
||||
{
|
||||
void *buf;
|
||||
uint64_t pos, len;
|
||||
};
|
||||
|
||||
class blockstore_init_journal
|
||||
{
|
||||
blockstore_impl_t *bs;
|
||||
int wait_state = 0, wait_count = 0, handle_res = 0;
|
||||
uint64_t entries_loaded = 0;
|
||||
uint32_t crc32_last = 0;
|
||||
bool started = false;
|
||||
uint64_t next_free;
|
||||
std::vector<bs_init_journal_done> done;
|
||||
std::vector<obj_ver_id> double_allocs;
|
||||
std::vector<iovec> small_write_data;
|
||||
uint64_t journal_pos = 0;
|
||||
uint64_t continue_pos = 0;
|
||||
void *init_write_buf = NULL;
|
||||
uint64_t init_write_sector = 0;
|
||||
bool wrapped = false;
|
||||
void *submitted_buf;
|
||||
struct io_uring_sqe *sqe;
|
||||
struct ring_data_t *data;
|
||||
journal_entry_start *je_start;
|
||||
std::function<void(ring_data_t*)> simple_callback;
|
||||
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
||||
void handle_event(ring_data_t *data);
|
||||
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
|
||||
public:
|
||||
blockstore_init_journal(blockstore_impl_t* bs);
|
||||
int loop();
|
||||
};
|
||||
|
@@ -1,328 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
|
||||
{
|
||||
this->bs = bs;
|
||||
sectors_to_write = 0;
|
||||
next_pos = bs->journal.next_free;
|
||||
next_sector = bs->journal.cur_sector;
|
||||
first_sector = -1;
|
||||
next_in_pos = bs->journal.in_sector_pos;
|
||||
right_dir = next_pos >= bs->journal.used_start;
|
||||
}
|
||||
|
||||
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
|
||||
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
|
||||
{
|
||||
uint64_t prev_next = next_sector;
|
||||
int required = entries_required;
|
||||
while (1)
|
||||
{
|
||||
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
|
||||
? 0
|
||||
: (bs->journal.block_size - next_in_pos) / size;
|
||||
if (fits > 0)
|
||||
{
|
||||
if (fits > required)
|
||||
{
|
||||
fits = required;
|
||||
}
|
||||
if (first_sector == -1)
|
||||
{
|
||||
first_sector = next_sector;
|
||||
}
|
||||
required -= fits;
|
||||
next_in_pos += fits * size;
|
||||
if (next_sector != prev_next || !sectors_to_write)
|
||||
{
|
||||
// Except the previous call to this function
|
||||
sectors_to_write++;
|
||||
}
|
||||
}
|
||||
else if (bs->journal.sector_info[next_sector].dirty)
|
||||
{
|
||||
if (next_sector != prev_next || !sectors_to_write)
|
||||
{
|
||||
// Except the previous call to this function
|
||||
sectors_to_write++;
|
||||
}
|
||||
}
|
||||
if (required <= 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
next_pos = next_pos + bs->journal.block_size;
|
||||
if (next_pos >= bs->journal.len)
|
||||
{
|
||||
next_pos = bs->journal.block_size;
|
||||
right_dir = false;
|
||||
}
|
||||
next_in_pos = 0;
|
||||
next_sector = ((next_sector + 1) % bs->journal.sector_count);
|
||||
if (next_sector == first_sector)
|
||||
{
|
||||
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
|
||||
// This is an error condition, we can't wait for anything in this case
|
||||
throw std::runtime_error(
|
||||
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
|
||||
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
|
||||
);
|
||||
}
|
||||
if (bs->journal.sector_info[next_sector].flush_count > 0 ||
|
||||
bs->journal.sector_info[next_sector].dirty)
|
||||
{
|
||||
// No memory buffer available. Wait for it.
|
||||
int used = 0, dirty = 0;
|
||||
for (int i = 0; i < bs->journal.sector_count; i++)
|
||||
{
|
||||
if (bs->journal.sector_info[i].dirty)
|
||||
{
|
||||
dirty++;
|
||||
used++;
|
||||
}
|
||||
if (bs->journal.sector_info[i].flush_count > 0)
|
||||
{
|
||||
used++;
|
||||
}
|
||||
}
|
||||
// In fact, it's even more rare than "ran out of journal space", so print a warning
|
||||
printf(
|
||||
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
|
||||
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
|
||||
used, bs->journal.sector_count, dirty, next_sector,
|
||||
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
|
||||
bs->journal.sector_info[next_sector].flush_count
|
||||
);
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (data_after > 0)
|
||||
{
|
||||
next_pos = next_pos + data_after;
|
||||
if (next_pos >= bs->journal.len)
|
||||
{
|
||||
if (right_dir)
|
||||
next_pos = bs->journal.block_size + data_after;
|
||||
right_dir = false;
|
||||
}
|
||||
}
|
||||
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
|
||||
{
|
||||
// No space in the journal. Wait until used_start changes.
|
||||
if (bs->log_level > 5)
|
||||
{
|
||||
printf(
|
||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||
);
|
||||
}
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||
bs->flusher->request_trim();
|
||||
PRIV(op)->wait_detail = bs->journal.used_start;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
|
||||
{
|
||||
if (!journal.entry_fits(size))
|
||||
{
|
||||
assert(!journal.sector_info[journal.cur_sector].dirty);
|
||||
// Move to the next journal sector
|
||||
if (journal.sector_info[journal.cur_sector].flush_count > 0)
|
||||
{
|
||||
// Also select next sector buffer in memory
|
||||
journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
|
||||
assert(!journal.sector_info[journal.cur_sector].flush_count);
|
||||
}
|
||||
else
|
||||
{
|
||||
journal.dirty_start = journal.next_free;
|
||||
}
|
||||
journal.sector_info[journal.cur_sector].written = false;
|
||||
journal.sector_info[journal.cur_sector].offset = journal.next_free;
|
||||
journal.in_sector_pos = 0;
|
||||
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
||||
// double check that next_free doesn't cross used_start from the left
|
||||
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||
journal.next_free = next_next_free;
|
||||
memset(journal.inmemory
|
||||
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
||||
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
|
||||
}
|
||||
journal_entry *je = (struct journal_entry*)(
|
||||
(journal.inmemory
|
||||
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
||||
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
|
||||
);
|
||||
journal.in_sector_pos += size;
|
||||
je->magic = JOURNAL_MAGIC;
|
||||
je->type = type;
|
||||
je->size = size;
|
||||
je->crc32_prev = journal.crc32_last;
|
||||
journal.sector_info[journal.cur_sector].dirty = true;
|
||||
return je;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
|
||||
{
|
||||
// Don't submit the same sector twice in the same batch
|
||||
if (!journal.sector_info[cur_sector].submit_id)
|
||||
{
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
// Caller must ensure availability of an SQE
|
||||
assert(sqe != NULL);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
journal.sector_info[cur_sector].written = true;
|
||||
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
|
||||
assert(journal.submit_id != 0); // check overflow
|
||||
journal.submitting_sectors.push_back(cur_sector);
|
||||
journal.sector_info[cur_sector].flush_count++;
|
||||
data->iov = (struct iovec){
|
||||
(journal.inmemory
|
||||
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
|
||||
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
|
||||
(size_t)journal.block_size
|
||||
};
|
||||
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
|
||||
io_uring_prep_writev(
|
||||
sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
|
||||
);
|
||||
}
|
||||
journal.sector_info[cur_sector].dirty = false;
|
||||
// But always remember that this operation has to wait until this exact journal write is finished
|
||||
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
|
||||
.pending = 1,
|
||||
.sector = cur_sector,
|
||||
.op = op,
|
||||
});
|
||||
auto priv = PRIV(op);
|
||||
priv->pending_ops++;
|
||||
if (!priv->min_flushed_journal_sector)
|
||||
priv->min_flushed_journal_sector = 1+cur_sector;
|
||||
assert(priv->min_flushed_journal_sector <= journal.sector_count);
|
||||
priv->max_flushed_journal_sector = 1+cur_sector;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
|
||||
{
|
||||
live = true;
|
||||
if (data->res != data->iov.iov_len)
|
||||
{
|
||||
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
|
||||
disk_error_abort("journal write", data->res, data->iov.iov_len);
|
||||
}
|
||||
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
|
||||
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
|
||||
{
|
||||
journal.sector_info[fl_it->second.sector].flush_count--;
|
||||
}
|
||||
auto is_first = fl_it == journal.flushing_ops.begin();
|
||||
while (fl_it != journal.flushing_ops.end())
|
||||
{
|
||||
bool del = false;
|
||||
if (fl_it->first == flush_id)
|
||||
{
|
||||
fl_it->second.pending = 0;
|
||||
del = is_first;
|
||||
}
|
||||
else
|
||||
{
|
||||
del = !fl_it->second.pending;
|
||||
}
|
||||
if (del)
|
||||
{
|
||||
// Do not complete this operation if previous writes are unfinished
|
||||
// Otherwise also complete following operations waiting for this one
|
||||
auto priv = PRIV(fl_it->second.op);
|
||||
priv->pending_ops--;
|
||||
assert(priv->pending_ops >= 0);
|
||||
if (priv->pending_ops == 0)
|
||||
{
|
||||
release_journal_sectors(fl_it->second.op);
|
||||
priv->op_state++;
|
||||
ringloop->wakeup();
|
||||
}
|
||||
journal.flushing_ops.erase(fl_it++);
|
||||
}
|
||||
else
|
||||
{
|
||||
fl_it++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
journal_t::~journal_t()
|
||||
{
|
||||
if (sector_buf)
|
||||
free(sector_buf);
|
||||
if (sector_info)
|
||||
free(sector_info);
|
||||
if (buffer)
|
||||
free(buffer);
|
||||
sector_buf = NULL;
|
||||
sector_info = NULL;
|
||||
buffer = NULL;
|
||||
}
|
||||
|
||||
uint64_t journal_t::get_trim_pos()
|
||||
{
|
||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is cleared to its end, restart from the beginning
|
||||
journal_used_it = used_sectors.begin();
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is empty
|
||||
return next_free;
|
||||
}
|
||||
else
|
||||
{
|
||||
// next_free does not need updating during trim
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
}
|
||||
else if (journal_used_it->first > used_start)
|
||||
{
|
||||
// Journal is cleared up to <journal_used_it>
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
// Can't trim journal
|
||||
return used_start;
|
||||
}
|
||||
|
||||
void journal_t::dump_diagnostics()
|
||||
{
|
||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is cleared to its end, restart from the beginning
|
||||
journal_used_it = used_sectors.begin();
|
||||
}
|
||||
printf(
|
||||
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
||||
);
|
||||
}
|
@@ -2,6 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <stdexcept>
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
@@ -14,12 +15,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
}
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
|
||||
flusher_start_threshold = strtoull(config["flusher_start_threshold"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||
perfect_csum_update = config["perfect_csum_update"] == "true" || config["perfect_csum_update"] == "1" || config["perfect_csum_update"] == "yes";
|
||||
if (config["autosync_writes"] != "")
|
||||
{
|
||||
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
|
||||
@@ -28,13 +31,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count || journal.flush_journal)
|
||||
if (!min_flusher_count)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!journal_trim_interval)
|
||||
{
|
||||
journal_trim_interval = 512;
|
||||
journal_trim_interval = 1024;
|
||||
}
|
||||
if (!flusher_start_threshold)
|
||||
{
|
||||
flusher_start_threshold = 32;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
@@ -68,23 +75,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
readonly = true;
|
||||
}
|
||||
if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
|
||||
{
|
||||
disable_data_fsync = true;
|
||||
}
|
||||
if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
|
||||
{
|
||||
disable_meta_fsync = true;
|
||||
}
|
||||
if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
|
||||
{
|
||||
disable_journal_fsync = true;
|
||||
}
|
||||
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
|
||||
{
|
||||
// Only flush journal and exit
|
||||
journal.flush_journal = true;
|
||||
}
|
||||
if (config["immediate_commit"] == "all")
|
||||
{
|
||||
immediate_commit = IMMEDIATE_ALL;
|
||||
@@ -94,85 +84,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
immediate_commit = IMMEDIATE_SMALL;
|
||||
}
|
||||
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
|
||||
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
|
||||
config["inmemory_metadata"] != "no";
|
||||
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
|
||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
meta_write_recheck_parallelism = strtoull(config["meta_write_recheck_parallelism"].c_str(), NULL, 10);
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (journal.sector_count < 2)
|
||||
{
|
||||
journal.sector_count = 32;
|
||||
}
|
||||
if (metadata_buf_size < 65536)
|
||||
{
|
||||
metadata_buf_size = 4*1024*1024;
|
||||
}
|
||||
if (dsk.meta_device == dsk.data_device)
|
||||
if (!meta_write_recheck_parallelism)
|
||||
{
|
||||
disable_meta_fsync = disable_data_fsync;
|
||||
meta_write_recheck_parallelism = 16;
|
||||
}
|
||||
if (dsk.journal_device == dsk.meta_device)
|
||||
{
|
||||
disable_journal_fsync = disable_meta_fsync;
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
|
||||
if (immediate_commit != IMMEDIATE_NONE && !dsk.disable_journal_fsync)
|
||||
{
|
||||
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
|
||||
if (immediate_commit == IMMEDIATE_ALL && !dsk.disable_data_fsync)
|
||||
{
|
||||
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||
}
|
||||
// init some fields
|
||||
journal.block_size = dsk.journal_block_size;
|
||||
journal.next_free = dsk.journal_block_size;
|
||||
journal.used_start = dsk.journal_block_size;
|
||||
// no free space because sector is initially unmapped
|
||||
journal.in_sector_pos = dsk.journal_block_size;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::calc_lengths()
|
||||
{
|
||||
dsk.calc_lengths();
|
||||
journal.len = dsk.journal_len;
|
||||
journal.block_size = dsk.journal_block_size;
|
||||
journal.offset = dsk.journal_offset;
|
||||
if (inmemory_meta)
|
||||
{
|
||||
metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
|
||||
if (!metadata_buffer)
|
||||
throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
|
||||
}
|
||||
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
|
||||
{
|
||||
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
|
||||
if (!clean_bitmaps)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Failed to allocate memory for the metadata sparse write bitmap ("+
|
||||
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
|
||||
);
|
||||
}
|
||||
}
|
||||
if (journal.inmemory)
|
||||
{
|
||||
journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
|
||||
if (!journal.buffer)
|
||||
throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
|
||||
}
|
||||
else
|
||||
{
|
||||
journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
|
||||
if (!journal.sector_buf)
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
|
||||
if (!journal.sector_info)
|
||||
{
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,258 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state)
|
||||
{
|
||||
return continue_rollback(op);
|
||||
}
|
||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||
{
|
||||
// Check that there are some versions greater than v->version (which may be zero),
|
||||
// check that they're unstable, synced, and not currently written to
|
||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||
.oid = ov.oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
// Already rolled back, skip this object version
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
|
||||
{
|
||||
// Already rolled back, skip this object version
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state) ||
|
||||
IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Sync the object
|
||||
return STAB_SPLIT_SYNC;
|
||||
}
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
}
|
||||
return STAB_SPLIT_TODO;
|
||||
}
|
||||
});
|
||||
if (r != 1)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// There is sufficient space. Check SQEs
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
int s = 0;
|
||||
auto v = (obj_ver_id*)op->buf;
|
||||
for (int i = 0; i < op->len; i++, v++)
|
||||
{
|
||||
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
}
|
||||
journal_entry_rollback *je = (journal_entry_rollback*)
|
||||
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
|
||||
je->oid = v->oid;
|
||||
je->version = v->version;
|
||||
je->crc32 = je_crc32((journal_entry*)je);
|
||||
journal.crc32_last = je->crc32;
|
||||
}
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
assert(s == space_check.sectors_to_write);
|
||||
PRIV(op)->op_state = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state == 2)
|
||||
goto resume_2;
|
||||
else if (PRIV(op)->op_state == 4)
|
||||
goto resume_4;
|
||||
else
|
||||
return 1;
|
||||
resume_2:
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = 3;
|
||||
return 1;
|
||||
}
|
||||
resume_4:
|
||||
obj_ver_id* v;
|
||||
int i;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
mark_rolled_back(*v);
|
||||
}
|
||||
// Acknowledge op
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||
{
|
||||
auto it = dirty_db.lower_bound((obj_ver_id){
|
||||
.oid = ov.oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
if (it != dirty_db.begin())
|
||||
{
|
||||
uint64_t max_unstable = 0;
|
||||
auto rm_start = it;
|
||||
auto rm_end = it;
|
||||
it--;
|
||||
while (1)
|
||||
{
|
||||
if (it->first.oid != ov.oid)
|
||||
break;
|
||||
else if (it->first.version <= ov.version)
|
||||
{
|
||||
if (!IS_STABLE(it->second.state))
|
||||
max_unstable = it->first.version;
|
||||
break;
|
||||
}
|
||||
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
|
||||
break;
|
||||
// Remove entry
|
||||
rm_start = it;
|
||||
if (it == dirty_db.begin())
|
||||
break;
|
||||
it--;
|
||||
}
|
||||
if (rm_start != rm_end)
|
||||
{
|
||||
erase_dirty(rm_start, rm_end, UINT64_MAX);
|
||||
auto unstab_it = unstable_writes.find(ov.oid);
|
||||
if (unstab_it != unstable_writes.end())
|
||||
{
|
||||
if (max_unstable == 0)
|
||||
unstable_writes.erase(unstab_it);
|
||||
else
|
||||
unstab_it->second = max_unstable;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
|
||||
{
|
||||
if (dirty_end == dirty_start)
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto dirty_it = dirty_end;
|
||||
dirty_it--;
|
||||
if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
object_id oid = dirty_it->first.oid;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
dirty_it = dirty_end;
|
||||
// Unblock operations blocked by delete flushing
|
||||
uint32_t next_state = BS_ST_IN_FLIGHT;
|
||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
|
||||
{
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
next_state = BS_ST_WAIT_BIG;
|
||||
}
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
dirty_it = dirty_end;
|
||||
dirty_it--;
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
|
||||
IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
big_to_flush--;
|
||||
}
|
||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
|
||||
dirty_it->second.location != UINT64_MAX)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
|
||||
}
|
||||
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
|
||||
);
|
||||
#endif
|
||||
if (used == 0)
|
||||
{
|
||||
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
|
||||
{
|
||||
// Mark current sector as "full" to select the new one
|
||||
journal.in_sector_pos = dsk.journal_block_size;
|
||||
}
|
||||
flusher->mark_trim_possible();
|
||||
}
|
||||
free_dirty_dyn_data(dirty_it->second);
|
||||
if (dirty_it == dirty_start)
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
}
|
||||
dirty_db.erase(dirty_start, dirty_end);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
|
||||
{
|
||||
if (e.dyn_data)
|
||||
{
|
||||
if (alloc_dyn_data &&
|
||||
--*((int*)e.dyn_data) == 0) // refcount
|
||||
{
|
||||
// dyn_data contains the bitmap and checksums
|
||||
// free it if it doesn't refer to the in-memory journal
|
||||
free(e.dyn_data);
|
||||
}
|
||||
e.dyn_data = NULL;
|
||||
}
|
||||
}
|
@@ -3,559 +3,87 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
// Stabilize small write:
|
||||
// 1) Copy data from the journal to the data device
|
||||
// 2) Increase version on the metadata device and sync it
|
||||
// 3) Advance clean_db entry's version, clear previous journal entries
|
||||
//
|
||||
// This makes 1 4K small write+sync look like:
|
||||
// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
|
||||
// WA = 2.375. It's not the best, SSD FTL-like redirect-write could probably be lower
|
||||
// even with defragmentation. But it's fixed and it's still better than in Ceph. :)
|
||||
// except for HDD-only clusters, because each write results in 3 seeks.
|
||||
|
||||
// Stabilize big write:
|
||||
// 1) Copy metadata from the journal to the metadata device
|
||||
// 2) Move dirty_db entry to clean_db and clear previous journal entries
|
||||
//
|
||||
// This makes 1 128K big write+sync look like:
|
||||
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
|
||||
// WA = 1.012. Very good :)
|
||||
|
||||
// Stabilize delete:
|
||||
// 1) Remove metadata entry and sync it
|
||||
// 2) Remove dirty_db entry and clear previous journal entries
|
||||
// We have 2 problems here:
|
||||
// - In the cluster environment, we must store the "tombstones" of deleted objects until
|
||||
// all replicas (not just quorum) agrees about their deletion. That is, "stabilize" is
|
||||
// not possible for deletes in degraded placement groups
|
||||
// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
|
||||
// object version. We must clear all previous entries, too.
|
||||
// FIXME Fix both problems - probably, by switching from "fixed" metadata tables to "dynamic"
|
||||
|
||||
// AND We must do it in batches, for the sake of reduced fsync call count
|
||||
// AND We must know what we stabilize. Basic workflow is like:
|
||||
// 1) primary OSD receives sync request
|
||||
// 2) it submits syncs to blockstore and peers
|
||||
// 3) after everyone acks sync it acks sync to the client
|
||||
// 4) after a while it takes his synced object list and sends stabilize requests
|
||||
// to peers and to its own blockstore, thus freeing the old version
|
||||
|
||||
struct ver_vector_t
|
||||
{
|
||||
obj_ver_id *items = NULL;
|
||||
uint64_t alloc = 0, size = 0;
|
||||
};
|
||||
|
||||
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
|
||||
{
|
||||
if (!vec.items)
|
||||
{
|
||||
vec.alloc = len;
|
||||
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
|
||||
for (auto sv = start; sv < end; sv++)
|
||||
{
|
||||
vec.items[vec.size++] = *sv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void append_version(ver_vector_t & vec, obj_ver_id ov)
|
||||
{
|
||||
if (vec.size >= vec.alloc)
|
||||
{
|
||||
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
|
||||
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
|
||||
}
|
||||
vec.items[vec.size++] = ov;
|
||||
}
|
||||
|
||||
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
|
||||
{
|
||||
bool found = false;
|
||||
int j = 0, k = 0;
|
||||
while (j < check.size())
|
||||
{
|
||||
if (check[j] == ov)
|
||||
found = true;
|
||||
if (check[j].oid == ov.oid && check[j].version <= ov.version)
|
||||
{
|
||||
to.push_back(check[j++]);
|
||||
if (count)
|
||||
(*count)--;
|
||||
}
|
||||
else
|
||||
check[k++] = check[j++];
|
||||
}
|
||||
check.resize(k);
|
||||
return found;
|
||||
}
|
||||
|
||||
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
|
||||
{
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
||||
unsynced_big_write_count += unsynced_big_writes.size();
|
||||
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
||||
// Create a sync operation, insert into the end of the queue
|
||||
// And move ourselves into the end too!
|
||||
// Rather hacky but that's what we need...
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->buf = NULL;
|
||||
sync_op->callback = [](blockstore_op_t *sync_op)
|
||||
{
|
||||
delete sync_op;
|
||||
};
|
||||
init_op(sync_op);
|
||||
int sync_res = continue_sync(sync_op);
|
||||
if (sync_res != 2)
|
||||
{
|
||||
// Put SYNC into the queue if it's not finished yet
|
||||
submit_queue.push_back(sync_op);
|
||||
}
|
||||
// Restore unsynced_writes
|
||||
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
||||
unsynced_big_write_count += unsynced_big_writes.size();
|
||||
if (sync_res == 2)
|
||||
{
|
||||
// Sync is immediately completed
|
||||
return NULL;
|
||||
}
|
||||
return sync_op;
|
||||
}
|
||||
|
||||
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
|
||||
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
|
||||
{
|
||||
bool add_sync = false;
|
||||
ver_vector_t good_vers, bad_vers;
|
||||
obj_ver_id* v;
|
||||
int i, todo = 0;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
int action = decider(*v);
|
||||
if (action < 0)
|
||||
{
|
||||
// Rollback changes
|
||||
for (auto & ov: PRIV(op)->sync_big_writes)
|
||||
{
|
||||
unsynced_big_writes.push_back(ov);
|
||||
unsynced_big_write_count++;
|
||||
}
|
||||
for (auto & ov: PRIV(op)->sync_small_writes)
|
||||
{
|
||||
unsynced_small_writes.push_back(ov);
|
||||
}
|
||||
free(good_vers.items);
|
||||
good_vers.items = NULL;
|
||||
free(bad_vers.items);
|
||||
bad_vers.items = NULL;
|
||||
// Error
|
||||
op->retval = action;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
else if (action == STAB_SPLIT_DONE)
|
||||
{
|
||||
// Already done
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
}
|
||||
else if (action == STAB_SPLIT_WAIT)
|
||||
{
|
||||
// Already in progress, we just have to wait until it finishes
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
append_version(bad_vers, *v);
|
||||
}
|
||||
else if (action == STAB_SPLIT_SYNC)
|
||||
{
|
||||
// Needs a SYNC, we have to send a SYNC if not already in progress
|
||||
//
|
||||
// If the object is not present in unsynced_(big|small)_writes then
|
||||
// it's currently being synced. If it's present then we can initiate
|
||||
// its sync ourselves.
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
append_version(bad_vers, *v);
|
||||
if (!add_sync)
|
||||
{
|
||||
PRIV(op)->sync_big_writes.clear();
|
||||
PRIV(op)->sync_small_writes.clear();
|
||||
add_sync = true;
|
||||
}
|
||||
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
|
||||
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
|
||||
}
|
||||
else /* if (action == STAB_SPLIT_TODO) */
|
||||
{
|
||||
if (good_vers.items)
|
||||
{
|
||||
// If we're selecting versions then append it
|
||||
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
|
||||
// And we don't want to select/allocate anything in that optimistic case
|
||||
append_version(good_vers, *v);
|
||||
}
|
||||
todo++;
|
||||
}
|
||||
}
|
||||
// In a pessimistic scenario, an operation may be split into 3:
|
||||
// - Stabilize synced entries
|
||||
// - Sync unsynced entries
|
||||
// - Continue for unsynced entries after sync
|
||||
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
|
||||
if (!todo && !bad_vers.size)
|
||||
{
|
||||
// Already stable
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
op->retval = 0;
|
||||
if (!todo && !add_sync)
|
||||
{
|
||||
// Only wait for inflight writes or current in-progress syncs
|
||||
return 0;
|
||||
}
|
||||
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
|
||||
if (add_sync)
|
||||
{
|
||||
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
|
||||
sync_op = selective_sync(op);
|
||||
}
|
||||
if (bad_vers.size)
|
||||
{
|
||||
// Split part of the request into a separate operation
|
||||
split_stab_op = new blockstore_op_t;
|
||||
split_stab_op->opcode = op->opcode;
|
||||
split_stab_op->buf = bad_vers.items;
|
||||
split_stab_op->len = bad_vers.size;
|
||||
init_op(split_stab_op);
|
||||
submit_queue.push_back(split_stab_op);
|
||||
}
|
||||
if (sync_op || split_stab_op || good_vers.items)
|
||||
{
|
||||
void *orig_buf = op->buf;
|
||||
if (good_vers.items)
|
||||
{
|
||||
op->buf = good_vers.items;
|
||||
op->len = good_vers.size;
|
||||
}
|
||||
// Make a wrapped callback
|
||||
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
||||
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
||||
auto cb = [op, good_items = good_vers.items,
|
||||
bad_items = bad_vers.items, split_op_counter,
|
||||
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
||||
{
|
||||
if (split_op->retval != 0)
|
||||
op->retval = split_op->retval;
|
||||
(*split_op_counter)--;
|
||||
assert((*split_op_counter) >= 0);
|
||||
if (op != split_op)
|
||||
delete split_op;
|
||||
if (!*split_op_counter)
|
||||
{
|
||||
free(good_items);
|
||||
free(bad_items);
|
||||
free(split_op_counter);
|
||||
op->buf = orig_buf;
|
||||
real_cb(op);
|
||||
}
|
||||
};
|
||||
if (sync_op)
|
||||
{
|
||||
sync_op->callback = cb;
|
||||
}
|
||||
if (split_stab_op)
|
||||
{
|
||||
split_stab_op->callback = cb;
|
||||
}
|
||||
op->callback = cb;
|
||||
}
|
||||
if (!todo)
|
||||
{
|
||||
// All work is postponed
|
||||
op->callback = NULL;
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Handles both stabilize (commit) and rollback
|
||||
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state)
|
||||
{
|
||||
return continue_stable(op);
|
||||
}
|
||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(ov);
|
||||
if (dirty_it == dirty_db.end())
|
||||
{
|
||||
auto & clean_db = clean_db_shard(ov.oid);
|
||||
auto clean_it = clean_db.find(ov.oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||
{
|
||||
// No such object version
|
||||
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
return -ENOENT;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
// Object not synced yet - sync it
|
||||
// In previous versions we returned EBUSY here and required
|
||||
// the caller (OSD) to issue a global sync first. But a global sync
|
||||
// waits for all writes in the queue including inflight writes. And
|
||||
// inflight writes may themselves be blocked by unstable writes being
|
||||
// still present in the journal and not flushed away from it.
|
||||
// So we must sync specific objects here.
|
||||
//
|
||||
// Even more, we have to process "stabilize" request in parts. That is,
|
||||
// we must stabilize all objects which are already synced. Otherwise
|
||||
// they may block objects which are NOT synced yet.
|
||||
return STAB_SPLIT_SYNC;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
break;
|
||||
}
|
||||
// Check previous versions too
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != ov.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return STAB_SPLIT_TODO;
|
||||
});
|
||||
if (r != 1)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// There is sufficient space. Check SQEs
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
int s = 0;
|
||||
auto v = (obj_ver_id*)op->buf;
|
||||
for (int i = 0; i < op->len; i++, v++)
|
||||
{
|
||||
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
}
|
||||
journal_entry_stable *je = (journal_entry_stable*)
|
||||
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
|
||||
je->oid = v->oid;
|
||||
je->version = v->version;
|
||||
je->crc32 = je_crc32((journal_entry*)je);
|
||||
journal.crc32_last = je->crc32;
|
||||
}
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
assert(s == space_check.sectors_to_write);
|
||||
PRIV(op)->op_state = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::continue_stable(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state == 2)
|
||||
goto resume_2;
|
||||
else if (PRIV(op)->op_state == 4)
|
||||
goto resume_4;
|
||||
else
|
||||
return 1;
|
||||
resume_2:
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = 3;
|
||||
return 1;
|
||||
}
|
||||
resume_4:
|
||||
// Mark dirty_db entries as stable, acknowledge op completion
|
||||
obj_ver_id* v;
|
||||
int i;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
// Mark all dirty_db entries up to op->version as stable
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
#endif
|
||||
mark_stable(*v);
|
||||
}
|
||||
// Acknowledge op
|
||||
obj_ver_id *v = (obj_ver_id*)op->buf;
|
||||
auto priv = PRIV(op);
|
||||
if (priv->op_state == 1) goto resume_1;
|
||||
else if (priv->op_state == 2) goto resume_2;
|
||||
else if (priv->op_state == 3) goto resume_3;
|
||||
else if (priv->op_state == 4) goto resume_4;
|
||||
assert(!priv->op_state);
|
||||
// Modify in-memory state and assign contiguous LSNs
|
||||
priv->stab_pos = 0;
|
||||
priv->lsn = priv->to_lsn = 0;
|
||||
op->retval = 0;
|
||||
while (priv->stab_pos < op->len)
|
||||
{
|
||||
uint32_t modified_block = 0;
|
||||
uint64_t new_lsn = 0;
|
||||
uint64_t new_to_lsn = 0;
|
||||
int res = op->opcode == BS_OP_STABLE
|
||||
? heap->post_stabilize(v[priv->stab_pos].oid, v[priv->stab_pos].version, &modified_block, &new_lsn, &new_to_lsn)
|
||||
: heap->post_rollback(v[priv->stab_pos].oid, v[priv->stab_pos].version, &new_lsn, &modified_block);
|
||||
if (res != 0)
|
||||
{
|
||||
assert(res == ENOENT || res == EBUSY);
|
||||
op->retval = -res;
|
||||
}
|
||||
if (new_lsn)
|
||||
{
|
||||
assert(priv->lsn == 0 || priv->to_lsn == new_lsn-1);
|
||||
if (!priv->lsn)
|
||||
priv->lsn = new_lsn;
|
||||
priv->to_lsn = op->opcode == BS_OP_STABLE ? new_to_lsn : new_lsn;
|
||||
}
|
||||
priv->stab_pos++;
|
||||
}
|
||||
// Submit metadata writes
|
||||
priv->stab_pos = 0;
|
||||
resume_1:
|
||||
priv->op_state = 1;
|
||||
while (priv->stab_pos < op->len)
|
||||
{
|
||||
uint32_t block_num = 0;
|
||||
heap_object_t *obj = heap->read_entry(v[priv->stab_pos].oid, &block_num);
|
||||
if (obj)
|
||||
{
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
if (priv->pending_ops > 0)
|
||||
return 1;
|
||||
priv->wait_detail = 1;
|
||||
priv->wait_for = WAIT_SQE;
|
||||
return 0;
|
||||
}
|
||||
prepare_meta_block_write(op, block_num, sqe);
|
||||
}
|
||||
priv->stab_pos++;
|
||||
}
|
||||
if (priv->pending_ops > 0)
|
||||
{
|
||||
priv->op_state = 1;
|
||||
return 1;
|
||||
}
|
||||
// Mark writes as completed to allow compaction
|
||||
for (uint64_t lsn = priv->lsn; lsn <= priv->to_lsn; lsn++)
|
||||
{
|
||||
heap->mark_lsn_completed(lsn);
|
||||
}
|
||||
unsynced_meta_write_count++;
|
||||
// Fsync, just because our semantics imply that commit (stabilize) is immediately fsynced
|
||||
priv->op_state = 2;
|
||||
resume_2:
|
||||
resume_3:
|
||||
resume_4:
|
||||
int res = do_sync(op, 2);
|
||||
if (res != 2)
|
||||
{
|
||||
return res;
|
||||
}
|
||||
// Done. Don't touch op->retval - if anything resulted in ENOENT, return it as is
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(v);
|
||||
if (dirty_it != dirty_db.end())
|
||||
{
|
||||
if (IS_INSTANT(dirty_it->second.state))
|
||||
{
|
||||
// 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it.
|
||||
auto back_it = dirty_it;
|
||||
while (back_it != dirty_db.begin())
|
||||
{
|
||||
back_it--;
|
||||
if (back_it->first.oid != v.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (!IS_STABLE(back_it->second.state))
|
||||
{
|
||||
// There are preceding unstable versions, can't flush <v>
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
dirty_it++;
|
||||
if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid ||
|
||||
!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
dirty_it--;
|
||||
break;
|
||||
}
|
||||
v.version = dirty_it->first.version;
|
||||
}
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
bool was_stable = IS_STABLE(dirty_it->second.state);
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
|
||||
// Allocations and deletions are counted when they're stabilized
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
int exists = -1;
|
||||
if (dirty_it != dirty_db.begin())
|
||||
{
|
||||
auto prev_it = dirty_it;
|
||||
prev_it--;
|
||||
if (prev_it->first.oid == v.oid)
|
||||
{
|
||||
exists = IS_DELETE(prev_it->second.state) ? 0 : 1;
|
||||
}
|
||||
}
|
||||
if (exists == -1)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
exists = clean_it != clean_db.end() ? 1 : 0;
|
||||
}
|
||||
if (!exists)
|
||||
{
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
used_blocks++;
|
||||
}
|
||||
big_to_flush++;
|
||||
}
|
||||
else if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(space_id);
|
||||
used_blocks--;
|
||||
big_to_flush++;
|
||||
}
|
||||
}
|
||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// mark_stable should never be called for in-flight or submitted writes
|
||||
printf(
|
||||
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
dirty_it->second.state
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
|
||||
IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
// Big write overrides all previous dirty entries
|
||||
auto erase_end = dirty_it;
|
||||
while (dirty_it != dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
erase_dirty(dirty_it, erase_end, clean_loc);
|
||||
break;
|
||||
}
|
||||
if (was_stable || dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
flusher->enqueue_flush(v);
|
||||
}
|
||||
auto unstab_it = unstable_writes.find(v.oid);
|
||||
if (unstab_it != unstable_writes.end() &&
|
||||
unstab_it->second <= v.version)
|
||||
{
|
||||
unstable_writes.erase(unstab_it);
|
||||
}
|
||||
}
|
||||
|
@@ -3,231 +3,112 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
#define SYNC_HAS_SMALL 1
|
||||
#define SYNC_HAS_BIG 2
|
||||
#define SYNC_DATA_SYNC_SENT 3
|
||||
#define SYNC_DATA_SYNC_DONE 4
|
||||
#define SYNC_JOURNAL_WRITE_SENT 5
|
||||
#define SYNC_JOURNAL_WRITE_DONE 6
|
||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||
#define SYNC_DONE 8
|
||||
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
{
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
if (!PRIV(op)->op_state)
|
||||
{
|
||||
// We can return immediately because sync is only dequeued after all previous writes
|
||||
op->retval = 0;
|
||||
}
|
||||
int res = do_sync(op, 0);
|
||||
if (res == 2)
|
||||
{
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
if (PRIV(op)->op_state == 0)
|
||||
{
|
||||
stop_sync_submitted = false;
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
|
||||
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
|
||||
unsynced_big_writes.clear();
|
||||
unsynced_small_writes.clear();
|
||||
if (PRIV(op)->sync_big_writes.size() > 0)
|
||||
PRIV(op)->op_state = SYNC_HAS_BIG;
|
||||
else if (PRIV(op)->sync_small_writes.size() > 0)
|
||||
PRIV(op)->op_state = SYNC_HAS_SMALL;
|
||||
else
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
|
||||
{
|
||||
// No big writes, just fsync the journal
|
||||
if (journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
// Write out the last journal sector if it happens to be dirty
|
||||
BS_SUBMIT_CHECK_SQES(1);
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_HAS_BIG)
|
||||
{
|
||||
// 1st step: fsync data
|
||||
if (!disable_data_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
|
||||
{
|
||||
// 2nd step: Data device is synced, prepare & write journal entries
|
||||
// Check space in the journal and journal memory buffers
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (dsk.csum_block_size)
|
||||
{
|
||||
// More complex check because all journal entries have different lengths
|
||||
int left = PRIV(op)->sync_big_writes.size();
|
||||
for (auto & sbw: PRIV(op)->sync_big_writes)
|
||||
{
|
||||
left--;
|
||||
auto & dirty_entry = dirty_db.at(sbw);
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// Check SQEs. Don't bother about merging, submit each journal sector as a separate request
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
auto it = PRIV(op)->sync_big_writes.begin();
|
||||
int s = 0;
|
||||
while (it != PRIV(op)->sync_big_writes.end())
|
||||
{
|
||||
auto & dirty_entry = dirty_db.at(*it);
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
}
|
||||
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write) + dyn_size
|
||||
);
|
||||
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
assert(journal.next_free >= journal.used_start
|
||||
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||
: (jsec >= journal.used_start || jsec < journal.next_free));
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
#endif
|
||||
je->oid = it->oid;
|
||||
je->version = it->version;
|
||||
je->offset = dirty_entry.offset;
|
||||
je->len = dirty_entry.len;
|
||||
je->location = dirty_entry.location;
|
||||
memcpy((void*)(je+1), (alloc_dyn_data
|
||||
? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
|
||||
je->crc32 = je_crc32((journal_entry*)je);
|
||||
journal.crc32_last = je->crc32;
|
||||
it++;
|
||||
}
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
assert(s == space_check.sectors_to_write);
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
|
||||
return 1;
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
|
||||
{
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DONE)
|
||||
{
|
||||
ack_sync(op);
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
return res;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
bool blockstore_impl_t::submit_fsyncs(int & wait_count)
|
||||
{
|
||||
// Handle states
|
||||
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
||||
int n = ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync) +
|
||||
(unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.journal_fd != dsk.meta_fd) +
|
||||
(unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd);
|
||||
if (ringloop->space_left() < n)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
auto dirty_it = dirty_db.find(*it);
|
||||
dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED);
|
||||
if (dirty_it->second.state & BS_ST_INSTANT)
|
||||
{
|
||||
mark_stable(dirty_it->first);
|
||||
}
|
||||
else
|
||||
{
|
||||
unstable_unsynced--;
|
||||
assert(unstable_unsynced >= 0);
|
||||
}
|
||||
dirty_it++;
|
||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
||||
{
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
||||
if (!n)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
if (dirty_db[*it].state == (BS_ST_DELETE | BS_ST_WRITTEN))
|
||||
{
|
||||
dirty_db[*it].state = (BS_ST_DELETE | BS_ST_SYNCED);
|
||||
// Deletions are treated as immediately stable
|
||||
mark_stable(*it);
|
||||
}
|
||||
else /* (BS_ST_INSTANT?) | BS_ST_SMALL_WRITE | BS_ST_WRITTEN */
|
||||
{
|
||||
dirty_db[*it].state = (dirty_db[*it].state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
|
||||
if (dirty_db[*it].state & BS_ST_INSTANT)
|
||||
{
|
||||
mark_stable(*it);
|
||||
}
|
||||
else
|
||||
{
|
||||
unstable_unsynced--;
|
||||
assert(unstable_unsynced >= 0);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
auto cb = [this, & wait_count](ring_data_t *data)
|
||||
{
|
||||
if (data->res != 0)
|
||||
disk_error_abort("sync meta", data->res, 0);
|
||||
wait_count--;
|
||||
assert(wait_count >= 0);
|
||||
if (!wait_count)
|
||||
ringloop->wakeup();
|
||||
};
|
||||
if ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync)
|
||||
{
|
||||
// fsync meta
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.meta_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
if (unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.meta_fd != dsk.journal_fd)
|
||||
{
|
||||
// fsync buffer
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
if (unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd)
|
||||
{
|
||||
// fsync data
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
unsynced_big_write_count = 0;
|
||||
unsynced_small_write_count = 0;
|
||||
unsynced_meta_write_count = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::do_sync(blockstore_op_t *op, int base_state)
|
||||
{
|
||||
int op_state = PRIV(op)->op_state - base_state;
|
||||
if (op_state == 1) goto resume_1;
|
||||
if (op_state == 2) goto resume_2;
|
||||
assert(!op_state);
|
||||
if (flusher->get_syncing_buffer())
|
||||
{
|
||||
// Wait for flusher-initiated sync
|
||||
return 0;
|
||||
}
|
||||
if (dsk.disable_journal_fsync && dsk.disable_meta_fsync && dsk.disable_data_fsync || !unsynced_big_write_count && !unsynced_small_write_count)
|
||||
{
|
||||
// We can return immediately because sync only syncs previous writes
|
||||
unsynced_big_write_count = unsynced_small_write_count = unsynced_meta_write_count = 0;
|
||||
return 2;
|
||||
}
|
||||
PRIV(op)->lsn = heap->get_completed_lsn();
|
||||
if (!submit_fsyncs(PRIV(op)->pending_ops))
|
||||
{
|
||||
PRIV(op)->wait_detail = 1;
|
||||
PRIV(op)->wait_for = WAIT_SQE;
|
||||
return 0;
|
||||
}
|
||||
resume_1:
|
||||
if (PRIV(op)->pending_ops > 0)
|
||||
{
|
||||
PRIV(op)->op_state = base_state+1;
|
||||
return 1;
|
||||
}
|
||||
resume_2:
|
||||
heap->mark_lsn_fsynced(PRIV(op)->lsn);
|
||||
return 2;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -12,7 +12,7 @@
|
||||
// [LD_PRELOAD=libasan.so.8] \
|
||||
// fio -name=test -thread -ioengine=../build/src/blockstore/libfio_vitastor_blk.so \
|
||||
// -bs=4k -direct=1 -rw=randwrite -iodepth=16 -size=900M -loops=10 \
|
||||
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"immediate_commit":"all","journal_no_same_sector_overwrites":true}'
|
||||
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"meta_format":3,"immediate_commit":"all","log_level":100,"journal_no_same_sector_overwrites":true,"journal_sector_buffer_count":1024}'
|
||||
//
|
||||
// Linear write:
|
||||
//
|
||||
@@ -183,7 +183,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
{
|
||||
case DDIR_READ:
|
||||
op->opcode = BS_OP_READ;
|
||||
op->buf = io->xfer_buf;
|
||||
op->buf = (uint8_t*)io->xfer_buf;
|
||||
op->oid = {
|
||||
.inode = 1,
|
||||
.stripe = io->offset / bsd->bs->get_block_size(),
|
||||
@@ -204,7 +204,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
break;
|
||||
case DDIR_WRITE:
|
||||
op->opcode = BS_OP_WRITE_STABLE;
|
||||
op->buf = io->xfer_buf;
|
||||
op->buf = (uint8_t*)io->xfer_buf;
|
||||
op->oid = {
|
||||
.inode = 1,
|
||||
.stripe = io->offset / bsd->bs->get_block_size(),
|
||||
|
@@ -1,12 +1,11 @@
|
||||
// Old metadata format on-disk structures
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "crc32c.h"
|
||||
#include <set>
|
||||
|
||||
#define MIN_JOURNAL_SIZE 4*1024*1024
|
||||
#define JOURNAL_MAGIC 0x4A33
|
||||
#define JOURNAL_VERSION_V1 1
|
||||
#define JOURNAL_VERSION_V2 2
|
||||
@@ -145,74 +144,14 @@ inline uint32_t je_crc32(journal_entry *je)
|
||||
return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
|
||||
}
|
||||
|
||||
struct journal_sector_info_t
|
||||
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
||||
// per "clean" entry on disk with fixed metadata tables
|
||||
struct __attribute__((__packed__)) clean_disk_entry
|
||||
{
|
||||
uint64_t offset;
|
||||
uint64_t flush_count;
|
||||
bool written;
|
||||
bool dirty;
|
||||
uint64_t submit_id;
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint8_t bitmap[];
|
||||
// Two more fields come after bitmap in metadata version 2:
|
||||
// uint32_t data_csum[];
|
||||
// uint32_t entry_csum;
|
||||
};
|
||||
|
||||
struct pending_journaling_t
|
||||
{
|
||||
int pending;
|
||||
int sector;
|
||||
blockstore_op_t *op;
|
||||
};
|
||||
|
||||
struct journal_t
|
||||
{
|
||||
int fd;
|
||||
bool inmemory = false;
|
||||
bool flush_journal = false;
|
||||
void *buffer = NULL;
|
||||
|
||||
uint64_t block_size;
|
||||
uint64_t offset, len;
|
||||
// Next free block offset
|
||||
uint64_t next_free = 0;
|
||||
// First occupied block offset
|
||||
uint64_t used_start = 0;
|
||||
// End of the last block not used for writing anymore
|
||||
uint64_t dirty_start = 0;
|
||||
uint32_t crc32_last = 0;
|
||||
|
||||
// Current sector(s) used for writing
|
||||
void *sector_buf = NULL;
|
||||
journal_sector_info_t *sector_info = NULL;
|
||||
uint64_t sector_count;
|
||||
bool no_same_sector_overwrites = false;
|
||||
int cur_sector = 0;
|
||||
int in_sector_pos = 0;
|
||||
std::vector<int> submitting_sectors;
|
||||
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
|
||||
uint64_t submit_id = 0;
|
||||
|
||||
// Used sector map
|
||||
// May use ~ 80 MB per 1 GB of used journal space in the worst case
|
||||
std::map<uint64_t, uint64_t> used_sectors;
|
||||
|
||||
~journal_t();
|
||||
bool trim();
|
||||
uint64_t get_trim_pos();
|
||||
void dump_diagnostics();
|
||||
inline bool entry_fits(int size)
|
||||
{
|
||||
return !(block_size - in_sector_pos < size ||
|
||||
no_same_sector_overwrites && sector_info[cur_sector].written);
|
||||
}
|
||||
};
|
||||
|
||||
struct blockstore_journal_check_t
|
||||
{
|
||||
blockstore_impl_t *bs;
|
||||
uint64_t next_pos, next_sector, next_in_pos;
|
||||
int sectors_to_write, first_sector;
|
||||
bool right_dir; // writing to the end or the beginning of the ring buffer
|
||||
|
||||
blockstore_journal_check_t(blockstore_impl_t *bs);
|
||||
int check_available(blockstore_op_t *op, int required, int size, int data_after);
|
||||
};
|
||||
|
||||
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
|
338
src/blockstore/multilist.cpp
Normal file
338
src/blockstore/multilist.cpp
Normal file
@@ -0,0 +1,338 @@
|
||||
// Variable-length O(1) disk space allocator
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <set>
|
||||
#include "multilist.h"
|
||||
|
||||
multilist_alloc_t::multilist_alloc_t(uint32_t count, uint32_t maxn):
|
||||
count(count), maxn(maxn)
|
||||
{
|
||||
// not-so-memory-efficient: 16 MB memory per 1 GB buffer space, but buffer spaces are small, so OK
|
||||
assert(count > 1 && count < 0x80000000);
|
||||
sizes.resize(count);
|
||||
nexts.resize(count); // nexts[i] = 0 -> area is used; nexts[i] = 1 -> no next; nexts[i] >= 2 -> next item
|
||||
prevs.resize(count);
|
||||
heads.resize(maxn); // heads[i] = 0 -> empty list; heads[i] >= 1 -> list head
|
||||
sizes[0] = count;
|
||||
sizes[count-1] = -count; // end
|
||||
nexts[0] = 1;
|
||||
heads[maxn-1] = 1;
|
||||
#ifdef MULTILIST_TRACE
|
||||
print();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool multilist_alloc_t::is_free(uint32_t pos)
|
||||
{
|
||||
assert(pos < count);
|
||||
if (sizes[pos] < 0)
|
||||
pos += sizes[pos]+1;
|
||||
while (pos > 0 && !sizes[pos])
|
||||
pos--;
|
||||
return nexts[pos] > 0;
|
||||
}
|
||||
|
||||
uint32_t multilist_alloc_t::find(uint32_t size)
|
||||
{
|
||||
assert(size > 0);
|
||||
assert(size <= maxn);
|
||||
for (uint32_t i = size-1; i < maxn; i++)
|
||||
{
|
||||
if (heads[i])
|
||||
{
|
||||
return heads[i]-1;
|
||||
}
|
||||
}
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
void multilist_alloc_t::verify()
|
||||
{
|
||||
std::set<uint32_t> reachable;
|
||||
for (int i = 0; i < maxn; i++)
|
||||
{
|
||||
uint32_t cur = heads[i];
|
||||
while (cur)
|
||||
{
|
||||
if (!nexts[cur-1])
|
||||
{
|
||||
fprintf(stderr, "ERROR: item %d from freelist %d is not free\n", cur-1, i);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
if (nexts[cur-1] >= count+2)
|
||||
{
|
||||
fprintf(stderr, "ERROR: next out of range at %d: %d\n", cur-1, nexts[cur-1]);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
if (!(i < maxn-1 ? sizes[cur-1] == i+1 : (sizes[cur-1] >= i+1)))
|
||||
{
|
||||
fprintf(stderr, "ERROR: item %d is in wrong freelist: expected size %d, but actual size is %d\n", cur-1, i+1, sizes[cur-1]);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
if (reachable.find(cur-1) != reachable.end())
|
||||
{
|
||||
fprintf(stderr, "ERROR: doubly-claimed item %d\n", cur-1);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
reachable.insert(cur-1);
|
||||
cur = nexts[cur-1]-1;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < count; )
|
||||
{
|
||||
if (sizes[i])
|
||||
{
|
||||
assert(i+sizes[i] <= count);
|
||||
if (sizes[i] > 1 && sizes[i+sizes[i]-1] != -sizes[i])
|
||||
{
|
||||
fprintf(stderr, "ERROR: start/end mismatch at %d: sizes[%d] should be %d, but is %d\n", i, i+sizes[i]-1, -sizes[i], sizes[i+sizes[i]-1]);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
for (int j = i+1; j < i+sizes[i]-1; j++)
|
||||
{
|
||||
if (sizes[j])
|
||||
{
|
||||
fprintf(stderr, "ERROR: internal non-zero at %d: %d\n", j, sizes[j]);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
}
|
||||
if (nexts[i] && reachable.find(i) == reachable.end())
|
||||
{
|
||||
fprintf(stderr, "ERROR: %d is unreachable from heads\n", i);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
if (nexts[i] >= 2)
|
||||
{
|
||||
if (nexts[i] >= 2+count)
|
||||
{
|
||||
fprintf(stderr, "ERROR: next out of range at %d: %d\n", i, nexts[i]);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
if (prevs[nexts[i]-2] != i+1)
|
||||
{
|
||||
fprintf(stderr, "ERROR: prev[next] (%d) != this (%d) at %d", prevs[nexts[i]-2], i+1, i);
|
||||
print();
|
||||
abort();
|
||||
}
|
||||
}
|
||||
i += (sizes[i] > 1 ? sizes[i] : 1);
|
||||
}
|
||||
else
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
void multilist_alloc_t::print()
|
||||
{
|
||||
printf("heads:");
|
||||
for (int i = 0; i < maxn; i++)
|
||||
if (heads[i])
|
||||
printf(" %u=%u", i, heads[i]);
|
||||
printf("\n");
|
||||
printf("sizes:");
|
||||
for (int i = 0; i < count; i++)
|
||||
if (sizes[i])
|
||||
printf(" %d=%d", i, sizes[i]);
|
||||
printf("\n");
|
||||
printf("prevs:");
|
||||
for (int i = 0; i < count; i++)
|
||||
if (prevs[i])
|
||||
printf(" %d=%d", i, prevs[i]);
|
||||
printf("\n");
|
||||
printf("nexts:");
|
||||
for (int i = 0; i < count; i++)
|
||||
if (nexts[i])
|
||||
printf(" %d=%d", i, nexts[i]);
|
||||
printf("\n");
|
||||
printf("items:");
|
||||
for (int i = 0; i < count; )
|
||||
{
|
||||
if (sizes[i])
|
||||
{
|
||||
printf(" %u=(s:%d,n:%u,p:%u)", i, sizes[i], nexts[i], prevs[i]);
|
||||
assert(i+sizes[i] <= count);
|
||||
i += (sizes[i] > 1 ? sizes[i] : 1);
|
||||
}
|
||||
else
|
||||
i++;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void multilist_alloc_t::use(uint32_t pos, uint32_t size)
|
||||
{
|
||||
assert(pos+size <= count && size > 0);
|
||||
if (sizes[pos] <= 0)
|
||||
{
|
||||
uint32_t start = pos;
|
||||
if (sizes[start] < 0)
|
||||
start += sizes[start]+1;
|
||||
else
|
||||
while (start > 0 && !sizes[start])
|
||||
start--;
|
||||
assert(sizes[start] >= size);
|
||||
use_full(start);
|
||||
uint32_t full = sizes[start];
|
||||
sizes[pos-1] = -pos+start;
|
||||
sizes[start] = pos-start;
|
||||
free(start);
|
||||
sizes[pos+size-1] = -size;
|
||||
sizes[pos] = size;
|
||||
if (pos+size < start+full)
|
||||
{
|
||||
sizes[start+full-1] = -(start+full-pos-size);
|
||||
sizes[pos+size] = start+full-pos-size;
|
||||
free(pos+size);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(sizes[pos] >= size);
|
||||
use_full(pos);
|
||||
if (sizes[pos] > size)
|
||||
{
|
||||
uint32_t full = sizes[pos];
|
||||
sizes[pos+size-1] = -size;
|
||||
sizes[pos] = size;
|
||||
sizes[pos+full-1] = -full+size;
|
||||
sizes[pos+size] = full-size;
|
||||
free(pos+size);
|
||||
}
|
||||
}
|
||||
#ifdef MULTILIST_TRACE
|
||||
print();
|
||||
#endif
|
||||
}
|
||||
|
||||
void multilist_alloc_t::use_full(uint32_t pos)
|
||||
{
|
||||
uint32_t prevsize = sizes[pos];
|
||||
assert(prevsize);
|
||||
assert(nexts[pos]);
|
||||
uint32_t pi = (prevsize < maxn ? prevsize : maxn)-1;
|
||||
if (heads[pi] == pos+1)
|
||||
heads[pi] = nexts[pos]-1;
|
||||
if (prevs[pos])
|
||||
nexts[prevs[pos]-1] = nexts[pos];
|
||||
if (nexts[pos] >= 2)
|
||||
prevs[nexts[pos]-2] = prevs[pos];
|
||||
prevs[pos] = 0;
|
||||
nexts[pos] = 0;
|
||||
}
|
||||
|
||||
void multilist_alloc_t::free(uint32_t pos)
|
||||
{
|
||||
do_free(pos);
|
||||
#ifdef MULTILIST_TRACE
|
||||
print();
|
||||
#endif
|
||||
}
|
||||
|
||||
void multilist_alloc_t::do_free(uint32_t pos)
|
||||
{
|
||||
assert(!nexts[pos]);
|
||||
uint32_t size = sizes[pos];
|
||||
assert(size > 0);
|
||||
// merge with previous?
|
||||
if (pos > 0 && nexts[pos+(sizes[pos-1] == 1 ? -1 : sizes[pos-1])] > 0)
|
||||
{
|
||||
assert(sizes[pos-1] < 0 || sizes[pos-1] == 1);
|
||||
uint32_t prevsize = sizes[pos-1] < 0 ? -sizes[pos-1] : 1;
|
||||
use_full(pos-prevsize);
|
||||
sizes[pos] = 0;
|
||||
sizes[pos-1] = 0;
|
||||
size += prevsize;
|
||||
pos -= prevsize;
|
||||
sizes[pos+size-1] = -size;
|
||||
sizes[pos] = size;
|
||||
}
|
||||
// merge with next?
|
||||
if (pos+size < count && nexts[pos+size] >= 1)
|
||||
{
|
||||
uint32_t nextsize = sizes[pos+size];
|
||||
use_full(pos+size);
|
||||
sizes[pos+size] = 0;
|
||||
sizes[pos+size-1] = 0;
|
||||
size += nextsize;
|
||||
sizes[pos+size-1] = -size;
|
||||
sizes[pos] = size;
|
||||
}
|
||||
uint32_t ni = (size < maxn ? size : maxn)-1;
|
||||
nexts[pos] = heads[ni]+1;
|
||||
prevs[pos] = 0;
|
||||
if (heads[ni])
|
||||
prevs[heads[ni]-1] = pos+1;
|
||||
heads[ni] = pos+1;
|
||||
}
|
||||
|
||||
multilist_index_t::multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used):
|
||||
count(count), max_used(max_used)
|
||||
{
|
||||
assert(init_used < max_used);
|
||||
nexts.resize(count, UINT32_MAX);
|
||||
prevs.resize(count, UINT32_MAX);
|
||||
heads.resize(max_used, UINT32_MAX);
|
||||
for (size_t i = 0; i < count-1; i++)
|
||||
{
|
||||
nexts[i] = i+1;
|
||||
prevs[i+1] = i;
|
||||
}
|
||||
prevs[0] = UINT32_MAX;
|
||||
nexts[count-1] = UINT32_MAX;
|
||||
heads[init_used] = 0;
|
||||
}
|
||||
|
||||
uint32_t multilist_index_t::find(uint32_t wanted_used)
|
||||
{
|
||||
assert(wanted_used < max_used);
|
||||
return heads[wanted_used];
|
||||
}
|
||||
|
||||
void multilist_index_t::change(uint32_t pos, uint32_t old_used, uint32_t new_used)
|
||||
{
|
||||
if (new_used == old_used)
|
||||
return;
|
||||
assert(old_used < max_used && new_used < max_used);
|
||||
if (prevs[pos] != UINT32_MAX)
|
||||
nexts[prevs[pos]] = nexts[pos];
|
||||
if (nexts[pos] != UINT32_MAX)
|
||||
prevs[nexts[pos]] = prevs[pos];
|
||||
if (heads[old_used] == pos)
|
||||
heads[old_used] = nexts[pos];
|
||||
prevs[pos] = UINT32_MAX;
|
||||
if (heads[new_used] != UINT32_MAX)
|
||||
prevs[heads[new_used]] = pos;
|
||||
nexts[pos] = heads[new_used];
|
||||
heads[new_used] = pos;
|
||||
}
|
||||
|
||||
void multilist_index_t::print()
|
||||
{
|
||||
printf("heads:");
|
||||
for (int i = 0; i < max_used; i++)
|
||||
if (heads[i] != UINT32_MAX)
|
||||
printf(" %u=%u", i, heads[i]);
|
||||
printf("\n");
|
||||
printf("prevs:");
|
||||
for (int i = 0; i < count; i++)
|
||||
if (prevs[i] != UINT32_MAX)
|
||||
printf(" %d=%d", i, prevs[i]);
|
||||
printf("\n");
|
||||
printf("nexts:");
|
||||
for (int i = 0; i < count; i++)
|
||||
if (nexts[i] != UINT32_MAX)
|
||||
printf(" %d=%d", i, nexts[i]);
|
||||
printf("\n");
|
||||
}
|
37
src/blockstore/multilist.h
Normal file
37
src/blockstore/multilist.h
Normal file
@@ -0,0 +1,37 @@
|
||||
// Variable-length O(1) disk space allocator
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
struct multilist_alloc_t
|
||||
{
|
||||
const uint32_t count, maxn;
|
||||
std::vector<int32_t> sizes;
|
||||
std::vector<uint32_t> nexts, prevs, heads;
|
||||
|
||||
multilist_alloc_t(uint32_t count, uint32_t maxn);
|
||||
bool is_free(uint32_t pos);
|
||||
uint32_t find(uint32_t size);
|
||||
void use_full(uint32_t pos);
|
||||
void use(uint32_t pos, uint32_t size);
|
||||
void do_free(uint32_t pos);
|
||||
void free(uint32_t pos);
|
||||
void verify();
|
||||
void print();
|
||||
};
|
||||
|
||||
struct multilist_index_t
|
||||
{
|
||||
const uint32_t count, max_used;
|
||||
std::vector<uint32_t> nexts, prevs, heads;
|
||||
|
||||
// used should be always < max_used
|
||||
multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used);
|
||||
uint32_t find(uint32_t wanted_used);
|
||||
void change(uint32_t pos, uint32_t old_used, uint32_t new_used);
|
||||
void print();
|
||||
};
|
@@ -1167,7 +1167,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
if (!cur_primary || !value["state"].is_array() || !state ||
|
||||
(state & PG_OFFLINE) && state != PG_OFFLINE ||
|
||||
(state & PG_PEERING) && state != PG_PEERING ||
|
||||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
|
||||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE && state != (PG_INCOMPLETE|PG_HAS_INVALID))
|
||||
{
|
||||
fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%ju, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
|
||||
return;
|
||||
|
@@ -6,8 +6,20 @@
|
||||
#include <stdint.h>
|
||||
#include <functional>
|
||||
|
||||
#define POOL_SCHEME_REPLICATED 1
|
||||
#define POOL_SCHEME_XOR 2
|
||||
#define POOL_SCHEME_EC 3
|
||||
#define POOL_ID_MAX 0x10000
|
||||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
typedef uint64_t inode_t;
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
typedef uint32_t pool_id_t;
|
||||
|
||||
// 16 bytes per object/stripe id
|
||||
// stripe = (start of the parity stripe + peer role)
|
||||
// i.e. for example (256KB + one of 0,1,2)
|
||||
|
@@ -44,11 +44,6 @@
|
||||
#define DIRECT_IO_ALIGNMENT 512
|
||||
#endif
|
||||
|
||||
// Memory allocation alignment (page size is usually optimal)
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Constants for osd_reply_describe_item_t.loc_bad
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
|
@@ -7,7 +7,8 @@ add_executable(vitastor-disk
|
||||
disk_tool.cpp disk_simple_offsets.cpp
|
||||
disk_tool_discard.cpp disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
|
||||
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
|
||||
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
|
||||
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp
|
||||
../blockstore/blockstore_disk.cpp ../blockstore/blockstore_heap.cpp ../blockstore/multilist.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-disk
|
||||
tcmalloc_minimal
|
||||
|
@@ -11,6 +11,7 @@
|
||||
#include "str_util.h"
|
||||
#include "blockstore.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_heap.h"
|
||||
|
||||
// Calculate offsets for a block device and print OSD command line parameters
|
||||
void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
@@ -23,6 +24,9 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
uint64_t journal_offset = parse_size(cfg["journal_offset"].string_value());
|
||||
uint64_t device_size = parse_size(cfg["device_size"].string_value());
|
||||
uint32_t csum_block_size = parse_size(cfg["csum_block_size"].string_value());
|
||||
uint32_t meta_format = cfg["meta_format"].uint64_value();
|
||||
if (!meta_format)
|
||||
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
|
||||
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
|
||||
if (cfg["data_csum_type"] == "crc32c")
|
||||
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
|
||||
@@ -123,10 +127,39 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
uint64_t meta_offset = journal_offset + ((journal_size+device_block_size-1)/device_block_size)*device_block_size;
|
||||
uint64_t data_csum_size = (data_csum_type ? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
|
||||
uint64_t clean_entry_bitmap_size = data_block_size/bitmap_granularity/8;
|
||||
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size + data_csum_size + 4 /*entry_csum*/;
|
||||
uint64_t entries_per_block = device_block_size / clean_entry_size;
|
||||
uint64_t object_count = ((device_size-meta_offset)/data_block_size);
|
||||
uint64_t meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
|
||||
uint64_t meta_size;
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
uint32_t min_object_size = sizeof(heap_object_t)+sizeof(heap_write_t)+data_csum_size+2*clean_entry_bitmap_size;
|
||||
uint32_t meta_block_target_free_space = cfg["meta_block_target_free_space"].uint64_value();
|
||||
if (!meta_block_target_free_space || meta_block_target_free_space > device_block_size-min_object_size)
|
||||
meta_block_target_free_space = 800;
|
||||
double meta_reserve = cfg["meta_reserve"].number_value();
|
||||
if (!meta_reserve)
|
||||
meta_reserve = 1.5;
|
||||
else if (meta_reserve < 1)
|
||||
meta_reserve = 1;
|
||||
uint32_t entries_per_block = (device_block_size-meta_block_target_free_space) / min_object_size;
|
||||
meta_size = device_block_size * (uint64_t)((object_count+entries_per_block-1) / entries_per_block * meta_reserve);
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size + data_csum_size + 4 /*entry_csum*/;
|
||||
uint64_t entries_per_block = device_block_size / clean_entry_size;
|
||||
meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
|
||||
{
|
||||
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
|
||||
uint64_t entries_per_block = device_block_size / clean_entry_size;
|
||||
meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "meta_format %u is not supported\n", meta_format);
|
||||
exit(1);
|
||||
}
|
||||
uint64_t data_offset = meta_offset + meta_size;
|
||||
if (format == "json")
|
||||
{
|
||||
|
@@ -48,6 +48,8 @@ static const char *help_text =
|
||||
" --max_other 10%\n"
|
||||
" Use disks for OSD data even if they already have non-Vitastor partitions,\n"
|
||||
" but only if these take up no more than this percent of disk space.\n"
|
||||
" --dry-run\n"
|
||||
" Check and print new OSD count for each disk but do not actually create them.\n"
|
||||
" \n"
|
||||
" Options (single-device mode):\n"
|
||||
" --data_device <DEV> Use partition <DEV> for data\n"
|
||||
@@ -179,8 +181,8 @@ static const char *help_text =
|
||||
" Options:\n"
|
||||
" --all Scan the whole journal area for entries and dump them, even outdated ones\n"
|
||||
" --json Dump journal in JSON format\n"
|
||||
" --format entries (Default) Dump actual journal entries as an array, without data\n"
|
||||
" --format data Same as \"entries\", but also include small write data\n"
|
||||
" --format data (Default) Dump journal entries as an array, with small write data\n"
|
||||
" --format entries Dump actual journal entries as an array, without data\n"
|
||||
" --format blocks Dump as an array of journal blocks each containing array of entries\n"
|
||||
"\n"
|
||||
"vitastor-disk write-journal <osd_device>\n"
|
||||
@@ -190,12 +192,12 @@ static const char *help_text =
|
||||
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
|
||||
"\n"
|
||||
"vitastor-disk dump-meta <osd_device>\n"
|
||||
"vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>\n"
|
||||
"vitastor-disk dump-meta [osd_options...]\n"
|
||||
" Dump metadata in JSON format.\n"
|
||||
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
|
||||
"\n"
|
||||
"vitastor-disk write-meta <osd_device>\n"
|
||||
"vitastor-disk write-meta <meta_file> <offset> <size>\n"
|
||||
"vitastor-disk write-meta [osd_options...]\n"
|
||||
" Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.\n"
|
||||
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
|
||||
"\n"
|
||||
@@ -362,47 +364,63 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
else if (!strcmp(cmd[0], "dump-meta"))
|
||||
{
|
||||
if (cmd.size() != 2 && cmd.size() < 5)
|
||||
{
|
||||
print_help(help_text, "vitastor-disk", cmd[0], false);
|
||||
return 1;
|
||||
}
|
||||
self.dsk.meta_device = cmd[1];
|
||||
if (cmd.size() > 2)
|
||||
if (cmd.size() == 5)
|
||||
{
|
||||
// Old format
|
||||
self.dsk.meta_device = cmd[1];
|
||||
self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10);
|
||||
self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
|
||||
self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
|
||||
self.dsk.meta_area_size = strtoull(cmd[4], NULL, 10);
|
||||
}
|
||||
else if (cmd.size() == 2)
|
||||
{
|
||||
// First argument is an OSD device - take metadata layout parameters from it
|
||||
self.dsk.meta_device = cmd[1];
|
||||
if (self.dump_load_check_superblock(self.dsk.meta_device))
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// First argument is an OSD device - take metadata layout parameters from it
|
||||
if (self.dump_load_check_superblock(self.dsk.meta_device))
|
||||
return 1;
|
||||
// Parse all OSD options from cmdline
|
||||
self.dsk.parse_config(self.options);
|
||||
if (self.options["io"] != "")
|
||||
self.dsk.data_io = self.dsk.meta_io = self.dsk.journal_io = self.options["io"];
|
||||
// FIXME: This is a really repeated pattern, move it somewhere
|
||||
self.dsk.open_data();
|
||||
self.dsk.open_meta();
|
||||
self.dsk.open_journal();
|
||||
self.dsk.calc_lengths();
|
||||
self.dsk.close_all();
|
||||
}
|
||||
return self.dump_meta();
|
||||
}
|
||||
else if (!strcmp(cmd[0], "write-meta"))
|
||||
{
|
||||
if (cmd.size() != 2 && cmd.size() < 4)
|
||||
{
|
||||
print_help(help_text, "vitastor-disk", cmd[0], false);
|
||||
return 1;
|
||||
}
|
||||
self.new_meta_device = cmd[1];
|
||||
if (cmd.size() > 2)
|
||||
if (cmd.size() == 4)
|
||||
{
|
||||
self.new_meta_device = cmd[1];
|
||||
self.new_meta_offset = strtoull(cmd[2], NULL, 10);
|
||||
self.new_meta_len = strtoull(cmd[3], NULL, 10);
|
||||
}
|
||||
else
|
||||
else if (cmd.size() == 2)
|
||||
{
|
||||
// First argument is an OSD device - take metadata layout parameters from it
|
||||
self.new_meta_device = cmd[1];
|
||||
if (self.dump_load_check_superblock(self.new_meta_device))
|
||||
return 1;
|
||||
self.new_meta_device = self.dsk.meta_device;
|
||||
self.new_meta_offset = self.dsk.meta_offset;
|
||||
self.new_meta_len = self.dsk.meta_len;
|
||||
self.new_meta_len = self.dsk.meta_area_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Parse all OSD options from cmdline
|
||||
self.dsk.parse_config(self.options);
|
||||
self.dsk.open_data();
|
||||
self.dsk.open_meta();
|
||||
self.dsk.open_journal();
|
||||
self.dsk.calc_lengths();
|
||||
self.dsk.close_all();
|
||||
}
|
||||
std::string json_err;
|
||||
json11::Json meta = json11::Json::parse(read_all_fd(0), json_err);
|
||||
|
@@ -15,7 +15,9 @@
|
||||
#include "json11/json11.hpp"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_impl.h"
|
||||
#include "meta_v1.h"
|
||||
#include "crc32c.h"
|
||||
#include "allocator.h"
|
||||
|
||||
// vITADisk
|
||||
#define VITASTOR_DISK_MAGIC 0x6b73694441544976
|
||||
@@ -43,7 +45,9 @@ struct disk_tool_t
|
||||
std::map<std::string, std::string> options;
|
||||
bool test_mode = false;
|
||||
bool all = false, json = false, now = false;
|
||||
bool dump_with_blocks, dump_with_data;
|
||||
bool dump_with_blocks = false, dump_with_data = false;
|
||||
bool dump_as_old = false;
|
||||
int log_level = 1;
|
||||
blockstore_disk_t dsk;
|
||||
|
||||
// resize data and/or move metadata and journal
|
||||
@@ -58,25 +62,30 @@ struct disk_tool_t
|
||||
uint64_t meta_pos;
|
||||
uint64_t journal_pos, journal_calc_data_pos;
|
||||
|
||||
uint8_t *buffer_area = NULL;
|
||||
bool first_block, first_entry;
|
||||
|
||||
allocator_t *data_alloc;
|
||||
allocator_t *data_alloc = NULL;
|
||||
std::map<uint64_t, uint64_t> data_remap;
|
||||
std::map<uint64_t, uint64_t>::iterator remap_it;
|
||||
ring_loop_t *ringloop;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
ring_consumer_t ring_consumer;
|
||||
int remap_active;
|
||||
journal_entry_start je_start;
|
||||
uint8_t *new_journal_buf, *new_meta_buf, *new_journal_ptr, *new_journal_data;
|
||||
uint8_t *new_journal_buf = NULL, *new_meta_buf = NULL, *new_journal_ptr = NULL, *new_journal_data = NULL;
|
||||
blockstore_meta_header_v3_t *new_meta_hdr = NULL;
|
||||
blockstore_disk_t new_dsk;
|
||||
blockstore_heap_t *new_heap = NULL;
|
||||
uint64_t new_journal_in_pos;
|
||||
int64_t data_idx_diff;
|
||||
uint64_t total_blocks, free_first, free_last;
|
||||
uint64_t new_clean_entry_bitmap_size, new_data_csum_size, new_clean_entry_size, new_entries_per_block;
|
||||
int new_journal_fd, new_meta_fd;
|
||||
resizer_data_moving_t *moving_blocks;
|
||||
uint32_t new_meta_format = 0;
|
||||
int new_journal_fd = -1, new_meta_fd = -1;
|
||||
resizer_data_moving_t *moving_blocks = NULL;
|
||||
|
||||
bool started;
|
||||
void *small_write_data;
|
||||
void *small_write_data = NULL;
|
||||
uint32_t data_crc32;
|
||||
bool data_csum_valid;
|
||||
uint32_t crc32_last;
|
||||
@@ -88,17 +97,24 @@ struct disk_tool_t
|
||||
void dump_journal_entry(int num, journal_entry *je, bool json);
|
||||
int process_journal(std::function<int(void*)> block_fn, bool do_open = true);
|
||||
int process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn);
|
||||
int process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
|
||||
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open = true);
|
||||
int process_meta(std::function<void(blockstore_meta_header_v3_t *)> hdr_fn,
|
||||
std::function<void(blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)> obj_fn,
|
||||
std::function<void(uint64_t block_num, clean_disk_entry *entry_v1, uint8_t *bitmap)> record_fn,
|
||||
bool with_data, bool do_open);
|
||||
|
||||
int dump_meta();
|
||||
void dump_meta_header(blockstore_meta_header_v2_t *hdr);
|
||||
void dump_meta_header(blockstore_meta_header_v3_t *hdr);
|
||||
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
|
||||
void dump_heap_entry_as_old(blockstore_heap_t *heap, heap_object_t *obj);
|
||||
void dump_heap_entry(blockstore_heap_t *heap, heap_object_t *obj);
|
||||
|
||||
int dump_load_check_superblock(const std::string & device);
|
||||
|
||||
int write_json_journal(json11::Json entries);
|
||||
int write_json_meta(json11::Json meta);
|
||||
int write_json_heap(json11::Json meta, json11::Json journal);
|
||||
int index_journal_by_object(json11::Json journal,
|
||||
std::map<object_id, std::vector<json11::Json::object>> & journal_by_object);
|
||||
|
||||
int resize_data(std::string device);
|
||||
int resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run);
|
||||
@@ -106,13 +122,17 @@ struct disk_tool_t
|
||||
|
||||
int raw_resize();
|
||||
int resize_parse_params();
|
||||
void resize_init(blockstore_meta_header_v2_t *hdr);
|
||||
void resize_init(blockstore_meta_header_v3_t *hdr);
|
||||
int resize_remap_blocks();
|
||||
int resize_copy_data();
|
||||
int resize_rewrite_journal();
|
||||
void resize_alloc_journal();
|
||||
void build_journal_start();
|
||||
void choose_journal_block(uint32_t je_size);
|
||||
int resize_rebuild_journal();
|
||||
int resize_write_new_journal();
|
||||
int resize_rewrite_meta();
|
||||
int resize_rebuild_meta();
|
||||
int resize_write_new_meta();
|
||||
void free_new_meta();
|
||||
|
||||
int udev_import(std::string device);
|
||||
int read_sb(std::string device);
|
||||
@@ -134,7 +154,8 @@ struct disk_tool_t
|
||||
int prepare(std::vector<std::string> devices);
|
||||
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
|
||||
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
|
||||
std::vector<std::string> get_new_data_parts(vitastor_dev_info_t & dev, uint64_t osd_per_disk, uint64_t max_other_percent);
|
||||
std::vector<std::string> get_new_data_parts(vitastor_dev_info_t & dev,
|
||||
uint64_t osd_per_disk, uint64_t max_other_percent, uint64_t *check_new_count);
|
||||
int get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std::map<std::string, std::string> & options);
|
||||
|
||||
int upgrade_simple_unit(std::string unit);
|
||||
|
@@ -54,12 +54,22 @@ int disk_tool_t::trim_data(std::string device)
|
||||
fprintf(stderr, "Reading metadata\n");
|
||||
data_alloc = new allocator_t(dsk.block_count);
|
||||
r = process_meta(
|
||||
[this](blockstore_meta_header_v2_t *hdr) {},
|
||||
[this](blockstore_meta_header_v3_t *hdr) {},
|
||||
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
|
||||
{
|
||||
for (auto wr = obj->get_writes(); wr; wr = wr->next())
|
||||
{
|
||||
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
|
||||
{
|
||||
data_alloc->set(wr->big_location(heap) / dsk.data_block_size, true);
|
||||
}
|
||||
}
|
||||
},
|
||||
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
{
|
||||
data_alloc->set(block_num, true);
|
||||
},
|
||||
false
|
||||
false, false
|
||||
);
|
||||
if (r != 0)
|
||||
{
|
||||
@@ -83,7 +93,7 @@ int disk_tool_t::trim_data(std::string device)
|
||||
return r;
|
||||
}
|
||||
// Trim
|
||||
r = dsk.trim_data(data_alloc);
|
||||
r = dsk.trim_data([&](uint64_t block_num) { return data_alloc->get(block_num); });
|
||||
dsk.close_all();
|
||||
return r == 0;
|
||||
}
|
||||
|
@@ -5,8 +5,8 @@
|
||||
|
||||
int disk_tool_t::dump_journal()
|
||||
{
|
||||
dump_with_blocks = options["format"] == "blocks";
|
||||
dump_with_data = options["format"] == "data" || options["format"] == "blocks,data";
|
||||
dump_with_blocks = options["format"] == "blocks" || options["format"] == "blocks,data";
|
||||
dump_with_data = options["format"] == "data" || options["format"] == "blocks,data" || options["format"] == "";
|
||||
if (dsk.journal_block_size < DIRECT_IO_ALIGNMENT || (dsk.journal_block_size % DIRECT_IO_ALIGNMENT) ||
|
||||
dsk.journal_block_size > 128*1024)
|
||||
{
|
||||
@@ -525,7 +525,9 @@ int disk_tool_t::write_json_journal(json11::Json entries)
|
||||
.data_offset = (uint64_t)(new_journal_data-new_journal_buf),
|
||||
.crc32_data = !dsk.data_csum_type ? 0 : (uint32_t)sscanf_json("%x", rec["data_crc32"]),
|
||||
};
|
||||
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->small_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
|
||||
uint32_t data_csum_blocks = !dsk.data_csum_type ? 0 :
|
||||
(((ne->small_write.offset+ne->small_write.len)/dsk.csum_block_size - ne->small_write.len/dsk.csum_block_size));
|
||||
uint32_t data_csum_size = data_csum_blocks*(dsk.data_csum_type & 0xFF);
|
||||
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write) + data_csum_size);
|
||||
fromhexstr(rec["data"].string_value(), ne->small_write.len, new_journal_data);
|
||||
if (ne->small_write.len > 0 && !rec["data"].is_string())
|
||||
@@ -534,17 +536,21 @@ int disk_tool_t::write_json_journal(json11::Json entries)
|
||||
free(new_journal_buf);
|
||||
return 1;
|
||||
}
|
||||
if (dsk.data_csum_type)
|
||||
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write));
|
||||
if (rec["data"].is_string())
|
||||
if (ne->small_write.len > 0)
|
||||
{
|
||||
if (!dsk.data_csum_type)
|
||||
ne->small_write.crc32_data = crc32c(0, new_journal_data, ne->small_write.len);
|
||||
else if (dsk.data_csum_type == BLOCKSTORE_CSUM_CRC32C)
|
||||
{
|
||||
uint32_t *block_csums = (uint32_t*)(((uint8_t*)ne) + sizeof(journal_entry_small_write));
|
||||
for (uint32_t i = 0; i < ne->small_write.len; i += dsk.csum_block_size, block_csums++)
|
||||
*block_csums = crc32c(0, new_journal_data+i, dsk.csum_block_size);
|
||||
for (uint32_t i = 0; i < data_csum_blocks; i++)
|
||||
{
|
||||
uint32_t block_begin = (ne->small_write.offset/dsk.csum_block_size + i) * dsk.csum_block_size;
|
||||
uint32_t block_end = (ne->small_write.offset/dsk.csum_block_size + (i+1)) * dsk.csum_block_size;
|
||||
block_begin = block_begin < ne->small_write.offset ? ne->small_write.offset : block_begin;
|
||||
block_end = block_end > ne->small_write.offset+ne->small_write.len ? ne->small_write.offset+ne->small_write.len : block_end;
|
||||
block_csums[i] = crc32c(0, new_journal_data+block_begin-ne->small_write.offset, block_end-block_begin);
|
||||
}
|
||||
}
|
||||
}
|
||||
new_journal_data += ne->small_write.len;
|
||||
@@ -565,7 +571,9 @@ int disk_tool_t::write_json_journal(json11::Json entries)
|
||||
.len = (uint32_t)rec["len"].uint64_value(),
|
||||
.location = sscanf_json(NULL, rec["loc"]),
|
||||
};
|
||||
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->big_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
|
||||
uint32_t data_csum_blocks = !dsk.data_csum_type ? 0 :
|
||||
(((ne->small_write.offset+ne->small_write.len)/dsk.csum_block_size - ne->small_write.len/dsk.csum_block_size));
|
||||
uint32_t data_csum_size = data_csum_blocks*(dsk.data_csum_type & 0xFF);
|
||||
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write) + data_csum_size);
|
||||
if (dsk.data_csum_type)
|
||||
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write));
|
||||
|
@@ -6,40 +6,120 @@
|
||||
#include "osd_id.h"
|
||||
#include "json_util.h"
|
||||
|
||||
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
|
||||
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open)
|
||||
#define FREE_SPACE_BIT 0x8000
|
||||
|
||||
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v3_t *)> hdr_fn,
|
||||
std::function<void(blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)> obj_fn,
|
||||
std::function<void(uint64_t block_num, clean_disk_entry *entry_v1, uint8_t *bitmap)> record_fn,
|
||||
bool with_data, bool do_open)
|
||||
{
|
||||
int r = 0;
|
||||
if (dsk.meta_block_size % DIRECT_IO_ALIGNMENT)
|
||||
{
|
||||
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
|
||||
return 1;
|
||||
}
|
||||
int buf_size = 1024*1024;
|
||||
if (buf_size % dsk.meta_block_size)
|
||||
buf_size = 8*dsk.meta_block_size;
|
||||
uint8_t *data = NULL;
|
||||
data = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
blockstore_meta_header_v3_t *hdr = (blockstore_meta_header_v3_t *)data;
|
||||
if (do_open)
|
||||
{
|
||||
if (dsk.meta_fd >= 0)
|
||||
{
|
||||
fprintf(stderr, "Bug: Metadata device is already opened\n");
|
||||
return 1;
|
||||
close_error:
|
||||
r = 1;
|
||||
goto close_free;
|
||||
}
|
||||
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
|
||||
if (dsk.meta_fd < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
|
||||
return 1;
|
||||
goto close_error;
|
||||
}
|
||||
}
|
||||
int buf_size = 1024*1024;
|
||||
if (buf_size % dsk.meta_block_size)
|
||||
buf_size = 8*dsk.meta_block_size;
|
||||
if (buf_size > dsk.meta_len)
|
||||
buf_size = dsk.meta_len;
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
||||
// Check superblock
|
||||
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)data;
|
||||
if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1)
|
||||
else if (dsk.meta_fd < 0)
|
||||
{
|
||||
fprintf(stderr, "Bug: Metadata device is not opened\n");
|
||||
goto close_error;
|
||||
}
|
||||
// Check superblock
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||
read_blocking(dsk.meta_fd, hdr, dsk.meta_block_size);
|
||||
if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1 && hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
if (hdr->data_csum_type != 0 &&
|
||||
hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C)
|
||||
{
|
||||
goto csum_unknown;
|
||||
}
|
||||
if (!dsk.journal_len && !with_data)
|
||||
{
|
||||
fprintf(stderr, "Buffer area (former journal) location must be specified to dump \"heap\" with data\n");
|
||||
goto close_error;
|
||||
}
|
||||
// Load buffer_area
|
||||
if (with_data)
|
||||
{
|
||||
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
|
||||
if (dsk.journal_device == dsk.meta_device || dsk.journal_device == "")
|
||||
{
|
||||
dsk.journal_fd = dsk.meta_fd;
|
||||
}
|
||||
else if (do_open)
|
||||
{
|
||||
if (dsk.journal_fd >= 0)
|
||||
{
|
||||
fprintf(stderr, "Bug: Metadata device is already opened\n");
|
||||
goto close_error;
|
||||
}
|
||||
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
|
||||
if (dsk.journal_fd < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
|
||||
goto close_error;
|
||||
}
|
||||
}
|
||||
else if (dsk.journal_fd < 0)
|
||||
{
|
||||
fprintf(stderr, "Bug: journal device is not opened\n");
|
||||
goto close_error;
|
||||
}
|
||||
uint64_t journal_pos = 0;
|
||||
lseek64(dsk.journal_fd, dsk.journal_offset+journal_pos, 0);
|
||||
while (journal_pos < dsk.journal_len)
|
||||
{
|
||||
uint64_t read_len = buf_size < dsk.journal_len-journal_pos ? buf_size : dsk.journal_len-journal_pos;
|
||||
read_blocking(dsk.journal_fd, buffer_area+journal_pos, read_len);
|
||||
journal_pos += read_len;
|
||||
}
|
||||
}
|
||||
blockstore_heap_t *heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
|
||||
// Load heap and just iterate it in memory
|
||||
hdr_fn(hdr);
|
||||
hdr = NULL;
|
||||
meta_pos = dsk.meta_block_size;
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0);
|
||||
while (meta_pos < dsk.meta_area_size)
|
||||
{
|
||||
uint64_t read_len = buf_size < dsk.meta_area_size-meta_pos ? buf_size : dsk.meta_area_size-meta_pos;
|
||||
read_blocking(dsk.meta_fd, data, read_len);
|
||||
heap->read_blocks(meta_pos-dsk.meta_block_size, read_len, data, [&](heap_object_t *obj)
|
||||
{
|
||||
obj_fn(heap, obj, ((uint8_t*)obj-data+meta_pos)/dsk.meta_block_size);
|
||||
}, [](uint32_t, uint32_t, uint8_t*){});
|
||||
meta_pos += read_len;
|
||||
}
|
||||
delete heap;
|
||||
}
|
||||
else if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1)
|
||||
{
|
||||
dsk.meta_format = hdr->version;
|
||||
dsk.calc_lengths();
|
||||
dsk.check_lengths();
|
||||
if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
|
||||
{
|
||||
// Vitastor 0.6-0.8 - static array of clean_disk_entry with bitmaps
|
||||
@@ -53,41 +133,21 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||
if (hdr->data_csum_type != 0 &&
|
||||
hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C)
|
||||
{
|
||||
csum_unknown:
|
||||
fprintf(stderr, "I don't know checksum format %u, the only supported format is crc32c = %u.\n", hdr->data_csum_type, BLOCKSTORE_CSUM_CRC32C);
|
||||
free(data);
|
||||
if (do_open)
|
||||
{
|
||||
close(dsk.meta_fd);
|
||||
dsk.meta_fd = -1;
|
||||
}
|
||||
return 1;
|
||||
goto close_error;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Unsupported version
|
||||
fprintf(stderr, "Metadata format is too new for me (stored version is %ju, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
|
||||
free(data);
|
||||
if (do_open)
|
||||
{
|
||||
close(dsk.meta_fd);
|
||||
dsk.meta_fd = -1;
|
||||
}
|
||||
return 1;
|
||||
goto close_error;
|
||||
}
|
||||
if (hdr->meta_block_size != dsk.meta_block_size)
|
||||
{
|
||||
fprintf(stderr, "Using block size of %u bytes based on information from the superblock\n", hdr->meta_block_size);
|
||||
dsk.meta_block_size = hdr->meta_block_size;
|
||||
if (buf_size % dsk.meta_block_size)
|
||||
{
|
||||
buf_size = 8*dsk.meta_block_size;
|
||||
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
memcpy(new_data, data, dsk.meta_block_size);
|
||||
free(data);
|
||||
data = new_data;
|
||||
hdr = (blockstore_meta_header_v2_t *)data;
|
||||
}
|
||||
}
|
||||
dsk.meta_format = hdr->version;
|
||||
dsk.data_block_size = hdr->data_block_size;
|
||||
@@ -101,14 +161,15 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||
*(hdr->data_csum_type & 0xff))
|
||||
: 0)
|
||||
+ (dsk.meta_format == BLOCKSTORE_META_FORMAT_V2 ? 4 /*entry_csum*/ : 0);
|
||||
// Read
|
||||
uint64_t block_num = 0;
|
||||
hdr_fn(hdr);
|
||||
hdr = NULL;
|
||||
meta_pos = dsk.meta_block_size;
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0);
|
||||
while (meta_pos < dsk.meta_len)
|
||||
while (meta_pos < dsk.min_meta_len)
|
||||
{
|
||||
uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
|
||||
uint64_t read_len = buf_size < dsk.min_meta_len-meta_pos ? buf_size : dsk.min_meta_len-meta_pos;
|
||||
read_blocking(dsk.meta_fd, data, read_len);
|
||||
meta_pos += read_len;
|
||||
for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
|
||||
@@ -123,7 +184,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + dsk.clean_entry_size - 4);
|
||||
if (*entry_csum != crc32c(0, entry, dsk.clean_entry_size - 4))
|
||||
{
|
||||
fprintf(stderr, "Metadata entry %ju is corrupt (checksum mismatch), skipping\n", block_num);
|
||||
fprintf(stderr, "Metadata entry %lu is corrupt (checksum mismatch), skipping\n", block_num);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -135,14 +196,15 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||
}
|
||||
else
|
||||
{
|
||||
// Vitastor 0.4-0.5 - static array of clean_disk_entry
|
||||
// Vitastor 0.4-0.5 - static array of clean_disk_entry without header
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||
dsk.clean_entry_bitmap_size = 0;
|
||||
dsk.clean_entry_size = sizeof(clean_disk_entry);
|
||||
uint64_t block_num = 0;
|
||||
hdr_fn(NULL);
|
||||
while (meta_pos < dsk.meta_len)
|
||||
while (meta_pos < dsk.meta_area_size)
|
||||
{
|
||||
uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
|
||||
uint64_t read_len = buf_size < dsk.meta_area_size-meta_pos ? buf_size : dsk.meta_area_size-meta_pos;
|
||||
read_blocking(dsk.meta_fd, data, read_len);
|
||||
meta_pos += read_len;
|
||||
for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
|
||||
@@ -158,13 +220,25 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
|
||||
}
|
||||
}
|
||||
}
|
||||
close_free:
|
||||
free(data);
|
||||
if (buffer_area)
|
||||
{
|
||||
free(buffer_area);
|
||||
buffer_area = NULL;
|
||||
}
|
||||
if (do_open)
|
||||
{
|
||||
close(dsk.meta_fd);
|
||||
dsk.meta_fd = -1;
|
||||
if (dsk.journal_fd >= 0)
|
||||
{
|
||||
if (dsk.journal_fd != dsk.meta_fd)
|
||||
close(dsk.journal_fd);
|
||||
dsk.journal_fd = -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return r;
|
||||
}
|
||||
|
||||
int disk_tool_t::dump_load_check_superblock(const std::string & device)
|
||||
@@ -180,7 +254,7 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
@@ -195,15 +269,33 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
|
||||
int disk_tool_t::dump_meta()
|
||||
{
|
||||
int r = process_meta(
|
||||
[this](blockstore_meta_header_v2_t *hdr) { dump_meta_header(hdr); },
|
||||
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); }
|
||||
[this](blockstore_meta_header_v3_t *hdr)
|
||||
{
|
||||
if (dump_as_old)
|
||||
{
|
||||
hdr->version = BLOCKSTORE_META_FORMAT_V2;
|
||||
hdr->compacted_lsn = 0;
|
||||
hdr->header_csum = 0;
|
||||
hdr->header_csum = crc32c(0, hdr, sizeof(blockstore_meta_header_v2_t));
|
||||
}
|
||||
dump_meta_header(hdr);
|
||||
},
|
||||
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
|
||||
{
|
||||
if (dump_as_old)
|
||||
dump_heap_entry_as_old(heap, obj);
|
||||
else
|
||||
dump_heap_entry(heap, obj);
|
||||
},
|
||||
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); },
|
||||
true, true
|
||||
);
|
||||
if (r == 0)
|
||||
printf("\n]}\n");
|
||||
return r;
|
||||
}
|
||||
|
||||
void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
|
||||
void disk_tool_t::dump_meta_header(blockstore_meta_header_v3_t *hdr)
|
||||
{
|
||||
if (hdr)
|
||||
{
|
||||
@@ -224,14 +316,151 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
|
||||
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
|
||||
);
|
||||
}
|
||||
else if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
printf(
|
||||
"{\"version\":\"3.0\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
|
||||
"\"data_csum_type\":\"%s\",\"csum_block_size\":%u,\"entries\":[\n",
|
||||
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
|
||||
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("{\"version\":\"0.5\",\"meta_block_size\":%ju,\"entries\":[\n", dsk.meta_block_size);
|
||||
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", dsk.meta_block_size);
|
||||
}
|
||||
first_entry = true;
|
||||
}
|
||||
|
||||
void disk_tool_t::dump_heap_entry_as_old(blockstore_heap_t *heap, heap_object_t *obj)
|
||||
{
|
||||
heap_write_t *wr = NULL;
|
||||
for (wr = obj->get_writes(); wr && wr->entry_type != (BS_HEAP_BIG_WRITE|BS_HEAP_STABLE) &&
|
||||
wr->entry_type != (BS_HEAP_TOMBSTONE|BS_HEAP_STABLE); wr = wr->next())
|
||||
{
|
||||
}
|
||||
if (!wr || wr->entry_type != (BS_HEAP_BIG_WRITE|BS_HEAP_STABLE))
|
||||
{
|
||||
return;
|
||||
}
|
||||
printf(
|
||||
#define ENTRY_FMT "{\"block\":%u,\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"version\":%ju"
|
||||
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
||||
#undef ENTRY_FMT
|
||||
wr->big().block_num, INODE_POOL(obj->inode), INODE_NO_POOL(obj->inode),
|
||||
obj->stripe, wr->version
|
||||
);
|
||||
printf(",\"bitmap\":\"");
|
||||
uint8_t* bitmap = wr->get_int_bitmap(heap);
|
||||
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
|
||||
{
|
||||
printf("%02x", bitmap[i]);
|
||||
}
|
||||
bitmap = wr->get_ext_bitmap(heap);
|
||||
printf("\",\"ext_bitmap\":\"");
|
||||
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
|
||||
{
|
||||
printf("%02x", bitmap[i]);
|
||||
}
|
||||
uint8_t *csums = wr->get_checksums(heap);
|
||||
uint32_t csum_size = wr->get_csum_size(heap);
|
||||
if (csums)
|
||||
{
|
||||
printf("\",\"block_csums\":\"");
|
||||
for (uint32_t i = 0; i < csum_size; i++)
|
||||
{
|
||||
printf("%02x", csums[i]);
|
||||
}
|
||||
}
|
||||
if (wr->get_checksum(heap))
|
||||
{
|
||||
printf("\",\"crc32c\":\"%08x", *wr->get_checksum(heap));
|
||||
}
|
||||
printf("\"}");
|
||||
first_entry = false;
|
||||
}
|
||||
|
||||
void disk_tool_t::dump_heap_entry(blockstore_heap_t *heap, heap_object_t *obj)
|
||||
{
|
||||
printf(
|
||||
#define ENTRY_FMT "{\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"writes\":["
|
||||
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
||||
#undef ENTRY_FMT
|
||||
INODE_POOL(obj->inode), INODE_NO_POOL(obj->inode), obj->stripe
|
||||
);
|
||||
heap_write_t *wr = NULL;
|
||||
bool first_wr = true;
|
||||
for (wr = obj->get_writes(); wr; wr = wr->next())
|
||||
{
|
||||
printf(
|
||||
#define ENTRY_FMT "{\"lsn\":%ju,\"version\":%ju,\"type\":\"%s\",\"stable\":%s"
|
||||
(first_wr ? ENTRY_FMT : ("," ENTRY_FMT)),
|
||||
#undef ENTRY_FMT
|
||||
wr->lsn, wr->version, (wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE ? "small" : (
|
||||
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE ? "big" : (
|
||||
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE ? "intent" : (
|
||||
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_TOMBSTONE ? "tombstone" : "unknown"))),
|
||||
(wr->entry_type & BS_HEAP_STABLE) ? "true" : "false"
|
||||
);
|
||||
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
|
||||
{
|
||||
printf(",\"location\":%ju", wr->big_location(heap));
|
||||
}
|
||||
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE)
|
||||
{
|
||||
printf(",\"offset\":%u,\"len\":%u", wr->small().offset, wr->small().len);
|
||||
}
|
||||
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE)
|
||||
{
|
||||
if (!dump_with_data)
|
||||
{
|
||||
printf(",\"offset\":%u,\"len\":%u,\"location\":%ju", wr->small().offset, wr->small().len, wr->small().location);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(",\"data\":\"");
|
||||
for (uint32_t i = 0; i < wr->small().len; i++)
|
||||
printf("%02x", buffer_area[wr->small().location + i]);
|
||||
printf("\"");
|
||||
}
|
||||
}
|
||||
uint8_t* bitmap = wr->get_int_bitmap(heap);
|
||||
if (bitmap)
|
||||
{
|
||||
printf(",\"bitmap\":\"");
|
||||
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
|
||||
printf("%02x", bitmap[i]);
|
||||
printf("\"");
|
||||
}
|
||||
bitmap = wr->get_ext_bitmap(heap);
|
||||
if (bitmap)
|
||||
{
|
||||
printf(",\"ext_bitmap\":\"");
|
||||
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
|
||||
printf("%02x", bitmap[i]);
|
||||
printf("\"");
|
||||
}
|
||||
uint8_t *csums = wr->get_checksums(heap);
|
||||
if (csums)
|
||||
{
|
||||
printf(",\"block_csums\":\"");
|
||||
uint32_t csum_size = wr->get_csum_size(heap);
|
||||
for (uint32_t i = 0; i < csum_size; i++)
|
||||
printf("%02x", csums[i]);
|
||||
printf("\"");
|
||||
}
|
||||
if (wr->get_checksum(heap))
|
||||
{
|
||||
printf(",\"data_crc32c\":\"%08x\"", *wr->get_checksum(heap));
|
||||
}
|
||||
printf("}");
|
||||
first_wr = false;
|
||||
}
|
||||
printf("]}");
|
||||
first_entry = false;
|
||||
}
|
||||
|
||||
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
{
|
||||
printf(
|
||||
@@ -294,7 +523,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
|
||||
? BLOCKSTORE_CSUM_CRC32C
|
||||
: BLOCKSTORE_CSUM_NONE);
|
||||
new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
|
||||
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
|
||||
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(blockstore_meta_header_v2_t));
|
||||
}
|
||||
uint32_t new_clean_entry_header_size = (new_hdr->version == BLOCKSTORE_META_FORMAT_V1
|
||||
? sizeof(clean_disk_entry) : sizeof(clean_disk_entry) + 4 /*entry_csum*/);
|
||||
@@ -304,6 +533,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
|
||||
: 0);
|
||||
new_clean_entry_size = new_clean_entry_header_size + 2*new_clean_entry_bitmap_size + new_data_csum_size;
|
||||
new_entries_per_block = new_hdr->meta_block_size / new_clean_entry_size;
|
||||
// FIXME: Use a streaming json parser
|
||||
for (const auto & e: meta["entries"].array_items())
|
||||
{
|
||||
uint64_t data_block = e["block"].uint64_value();
|
||||
@@ -337,7 +567,379 @@ int disk_tool_t::write_json_meta(json11::Json meta)
|
||||
}
|
||||
}
|
||||
int r = resize_write_new_meta();
|
||||
free(new_meta_buf);
|
||||
new_meta_buf = NULL;
|
||||
free_new_meta();
|
||||
return r;
|
||||
}
|
||||
|
||||
int disk_tool_t::write_json_heap(json11::Json meta, json11::Json journal)
|
||||
{
|
||||
new_meta_hdr->zero = 0;
|
||||
new_meta_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
|
||||
new_meta_hdr->version = BLOCKSTORE_META_FORMAT_HEAP;
|
||||
new_meta_hdr->meta_block_size = meta["meta_block_size"].uint64_value()
|
||||
? meta["meta_block_size"].uint64_value() : 4096;
|
||||
new_meta_hdr->data_block_size = meta["data_block_size"].uint64_value()
|
||||
? meta["data_block_size"].uint64_value() : 131072;
|
||||
new_meta_hdr->bitmap_granularity = meta["bitmap_granularity"].uint64_value()
|
||||
? meta["bitmap_granularity"].uint64_value() : 4096;
|
||||
new_meta_hdr->data_csum_type = meta["data_csum_type"].is_number()
|
||||
? meta["data_csum_type"].uint64_value()
|
||||
: (meta["data_csum_type"].string_value() == "crc32c"
|
||||
? BLOCKSTORE_CSUM_CRC32C
|
||||
: BLOCKSTORE_CSUM_NONE);
|
||||
new_meta_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
|
||||
new_meta_hdr->header_csum = crc32c(0, new_meta_hdr, sizeof(blockstore_meta_header_v3_t));
|
||||
new_clean_entry_bitmap_size = (new_meta_hdr->data_block_size / new_meta_hdr->bitmap_granularity + 7) / 8;
|
||||
new_clean_entry_size = 0;
|
||||
new_entries_per_block = 0;
|
||||
new_data_csum_size = (new_meta_hdr->data_csum_type
|
||||
? ((new_meta_hdr->data_block_size+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size*(new_meta_hdr->data_csum_type & 0xFF))
|
||||
: 0);
|
||||
new_journal_buf = new_journal_len ? (uint8_t*)memalign(MEM_ALIGNMENT, new_journal_len) : NULL;
|
||||
if (new_journal_len)
|
||||
{
|
||||
memset(new_journal_buf, 0, new_journal_len);
|
||||
}
|
||||
uint64_t total_used_space = 0;
|
||||
uint32_t used_space = 0;
|
||||
// FIXME: Use a streaming json parser
|
||||
if (meta["version"] == "3.0")
|
||||
{
|
||||
// New format
|
||||
std::vector<uint8_t> object_buf;
|
||||
new_heap = new blockstore_heap_t(&dsk, new_journal_buf, 0);
|
||||
for (const auto & meta_entry: meta["entries"].array_items())
|
||||
{
|
||||
bool invalid = false;
|
||||
object_id oid = {
|
||||
.inode = (sscanf_json(NULL, meta_entry["pool"]) << (64-POOL_ID_BITS)) | sscanf_json(NULL, meta_entry["inode"]),
|
||||
.stripe = sscanf_json(NULL, meta_entry["stripe"]),
|
||||
};
|
||||
object_buf.clear();
|
||||
object_buf.resize(sizeof(heap_object_t));
|
||||
heap_object_t *obj = (heap_object_t*)object_buf.data();
|
||||
obj->size = sizeof(heap_object_t);
|
||||
obj->write_pos = meta_entry["writes"].array_items().size() ? sizeof(heap_object_t) : 0;
|
||||
obj->entry_type = BS_HEAP_OBJECT;
|
||||
obj->inode = oid.inode;
|
||||
obj->stripe = oid.stripe;
|
||||
size_t pos = sizeof(heap_object_t);
|
||||
heap_write_t *last_wr = NULL;
|
||||
for (auto & write_entry: meta_entry["writes"].array_items())
|
||||
{
|
||||
object_buf.resize(object_buf.size() + new_heap->get_max_write_entry_size());
|
||||
heap_write_t *wr = (heap_write_t*)(object_buf.data() + pos);
|
||||
last_wr = wr;
|
||||
uint8_t wr_type = 0;
|
||||
if (write_entry["type"] == "small")
|
||||
wr_type = BS_HEAP_SMALL_WRITE;
|
||||
else if (write_entry["type"] == "intent")
|
||||
wr_type = BS_HEAP_INTENT_WRITE;
|
||||
else if (write_entry["type"] == "big")
|
||||
wr_type = BS_HEAP_BIG_WRITE;
|
||||
else if (write_entry["type"] == "tombstone")
|
||||
wr_type = BS_HEAP_TOMBSTONE;
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Write entry in %s has invalid type: %s, skipping object\n", meta_entry.dump().c_str(), write_entry["type"].dump().c_str());
|
||||
invalid = true;
|
||||
break;
|
||||
}
|
||||
wr->entry_type = wr_type | (write_entry["stable"].bool_value() ? BS_HEAP_STABLE : 0);
|
||||
wr->lsn = write_entry["lsn"].uint64_value();
|
||||
wr->version = write_entry["version"].uint64_value();
|
||||
wr->size = wr->get_size(new_heap);
|
||||
wr->next_pos = wr->size;
|
||||
if (wr_type == BS_HEAP_SMALL_WRITE || wr_type == BS_HEAP_INTENT_WRITE)
|
||||
{
|
||||
wr->small().offset = write_entry["offset"].uint64_value();
|
||||
wr->small().len = write_entry["len"].uint64_value();
|
||||
wr->small().location = write_entry["location"].uint64_value();
|
||||
if (wr_type == BS_HEAP_SMALL_WRITE && write_entry["data"].is_string() && wr->small().len > 0)
|
||||
{
|
||||
if (!new_journal_buf)
|
||||
{
|
||||
fprintf(stderr, "Loading small write data requires overwriting buffer area\n");
|
||||
free_new_meta();
|
||||
return 1;
|
||||
}
|
||||
wr->small().location = new_heap->find_free_buffer_area(wr->small().len);
|
||||
fromhexstr(write_entry["data"].string_value(), wr->small().len, new_journal_buf + wr->small().location);
|
||||
}
|
||||
}
|
||||
else if (wr_type == BS_HEAP_BIG_WRITE)
|
||||
{
|
||||
uint64_t loc = write_entry["location"].uint64_value();
|
||||
assert(!(loc % dsk.data_block_size));
|
||||
assert((loc / dsk.data_block_size) < 0xFFFF0000);
|
||||
wr->set_big_location(new_heap, loc);
|
||||
}
|
||||
if (write_entry["bitmap"].is_string() && wr->get_int_bitmap(new_heap))
|
||||
{
|
||||
fromhexstr(write_entry["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_int_bitmap(new_heap));
|
||||
}
|
||||
if (write_entry["ext_bitmap"].is_string() && wr->get_ext_bitmap(new_heap))
|
||||
{
|
||||
fromhexstr(write_entry["ext_bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(new_heap));
|
||||
}
|
||||
if (write_entry["block_csums"].is_string() && wr->get_checksums(new_heap))
|
||||
{
|
||||
fromhexstr(write_entry["block_csums"].string_value(), wr->get_csum_size(new_heap), wr->get_ext_bitmap(new_heap));
|
||||
}
|
||||
if (write_entry["data_crc32c"].is_string() && wr->get_checksum(new_heap))
|
||||
{
|
||||
*wr->get_checksum(new_heap) = sscanf_json("%jx", write_entry["data_crc32c"]);
|
||||
}
|
||||
}
|
||||
if (invalid)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
last_wr->next_pos = 0;
|
||||
new_heap->copy_object(obj, NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!journal.is_array())
|
||||
{
|
||||
fprintf(stderr, "Metadata should include journal in you want to convert it to the \"heap\" format\n");
|
||||
close_err:
|
||||
free(new_meta_buf);
|
||||
new_meta_buf = NULL;
|
||||
return 1;
|
||||
}
|
||||
std::map<object_id, std::vector<json11::Json::object>> journal_by_object;
|
||||
if (index_journal_by_object(journal, journal_by_object) != 0)
|
||||
{
|
||||
goto close_err;
|
||||
}
|
||||
journal = json11::Json();
|
||||
// Convert old format to the new format
|
||||
uint64_t next_lsn = 0;
|
||||
uint64_t meta_offset = 0;
|
||||
const uint32_t space_per_object = sizeof(heap_object_t) + sizeof(heap_write_t) +
|
||||
new_clean_entry_bitmap_size*2 + new_data_csum_size;
|
||||
uint64_t buffer_pos = 0;
|
||||
// FIXME: Rather ugly. Remove the dependency on dsk from heap?
|
||||
blockstore_disk_t dsk;
|
||||
dsk.bitmap_granularity = new_meta_hdr->bitmap_granularity;
|
||||
dsk.block_count = 16;
|
||||
dsk.data_block_size = new_meta_hdr->data_block_size;
|
||||
dsk.clean_entry_bitmap_size = new_clean_entry_bitmap_size;
|
||||
dsk.csum_block_size = new_meta_hdr->csum_block_size;
|
||||
dsk.data_csum_type = new_meta_hdr->data_csum_type;
|
||||
dsk.journal_len = 4096;
|
||||
dsk.meta_area_size = new_meta_len;
|
||||
dsk.meta_block_size = new_meta_hdr->meta_block_size;
|
||||
dsk.meta_block_target_free_space = 800;
|
||||
blockstore_heap_t heap(&dsk, NULL, 0);
|
||||
for (const auto & meta_entry: meta["entries"].array_items())
|
||||
{
|
||||
object_id oid = {
|
||||
.inode = (sscanf_json(NULL, meta_entry["pool"]) << (64-POOL_ID_BITS)) | sscanf_json(NULL, meta_entry["inode"]),
|
||||
.stripe = sscanf_json(NULL, meta_entry["stripe"]),
|
||||
};
|
||||
uint32_t space_for_this = space_per_object;
|
||||
auto j_it = journal_by_object.find(oid);
|
||||
if (j_it != journal_by_object.end())
|
||||
{
|
||||
for (auto & rec: j_it->second)
|
||||
{
|
||||
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
|
||||
{
|
||||
uint64_t off = rec["offset"].uint64_value();
|
||||
uint64_t len = rec["len"].uint64_value();
|
||||
if (off+len > new_meta_hdr->data_block_size)
|
||||
{
|
||||
fprintf(stderr, "Journal entry has too large offset or length: %s\n", json11::Json(rec).dump().c_str());
|
||||
goto close_err;
|
||||
}
|
||||
space_for_this += sizeof(heap_write_t) + new_clean_entry_bitmap_size +
|
||||
((off+len+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size - off/new_meta_hdr->csum_block_size) * (new_meta_hdr->data_csum_type & 0xFF);
|
||||
}
|
||||
else /*if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")*/
|
||||
{
|
||||
space_for_this += sizeof(heap_write_t) + 2*new_clean_entry_bitmap_size + new_data_csum_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (space_for_this > new_meta_hdr->meta_block_size)
|
||||
{
|
||||
fprintf(stderr, "Object doesn't fit in a single metadata block. Object meta: %s, object journal: %s\n",
|
||||
meta_entry.dump().c_str(), json11::Json(j_it->second).dump().c_str());
|
||||
goto close_err;
|
||||
}
|
||||
if (used_space + space_for_this > new_meta_hdr->meta_block_size-dsk.meta_block_target_free_space)
|
||||
{
|
||||
if (used_space < new_meta_hdr->meta_block_size-2)
|
||||
{
|
||||
*((uint16_t*)(new_meta_buf + meta_offset + used_space)) = FREE_SPACE_BIT | (uint16_t)(new_meta_hdr->meta_block_size-used_space);
|
||||
}
|
||||
meta_offset += new_meta_hdr->meta_block_size;
|
||||
used_space = 0;
|
||||
if (meta_offset >= new_meta_len)
|
||||
{
|
||||
fprintf(stderr, "Metadata doesn't fit into the new area (total used space: %ju, minimum free space in block: %u/%u)\n",
|
||||
total_used_space, dsk.meta_block_target_free_space, new_meta_hdr->meta_block_size);
|
||||
goto close_err;
|
||||
}
|
||||
}
|
||||
heap_object_t *obj = (heap_object_t*)(new_meta_buf + meta_offset + used_space);
|
||||
obj->size = sizeof(heap_object_t);
|
||||
obj->write_pos = sizeof(heap_object_t);
|
||||
obj->entry_type = BS_HEAP_OBJECT;
|
||||
obj->inode = oid.inode;
|
||||
obj->stripe = oid.stripe;
|
||||
heap_write_t *wr = obj->get_writes();
|
||||
wr->next_pos = 0;
|
||||
wr->entry_type = BS_HEAP_BIG_WRITE|BS_HEAP_STABLE;
|
||||
wr->lsn = ++next_lsn;
|
||||
wr->version = sscanf_json(NULL, meta_entry["version"]);
|
||||
wr->set_big_location(&heap, meta_entry["block"].uint64_value() * new_meta_hdr->data_block_size);
|
||||
wr->size = wr->get_size(&heap);
|
||||
fromhexstr(meta_entry["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_int_bitmap(&heap));
|
||||
fromhexstr(meta_entry["ext_bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
|
||||
if (new_meta_hdr->data_csum_type != 0)
|
||||
fromhexstr(meta_entry["data_csum"].string_value(), new_data_csum_size, wr->get_checksums(&heap));
|
||||
if (j_it != journal_by_object.end())
|
||||
{
|
||||
for (auto & rec: j_it->second)
|
||||
{
|
||||
wr->next_pos = wr->get_size(&heap);
|
||||
wr = wr->next();
|
||||
wr->next_pos = 0;
|
||||
wr->lsn = ++next_lsn;
|
||||
wr->version = rec["ver"].uint64_value();
|
||||
uint64_t wr_offset = rec["offset"].uint64_value();
|
||||
uint64_t wr_len = rec["len"].uint64_value();
|
||||
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
|
||||
{
|
||||
if (wr_len > 0 && !rec["data"].is_string())
|
||||
{
|
||||
fprintf(stderr, "Error: entry data is missing, please generate the dump with --json --format data\n");
|
||||
goto close_err;
|
||||
}
|
||||
wr->entry_type = BS_HEAP_SMALL_WRITE | (rec["type"] == "small_write_instant" ? BS_HEAP_STABLE : 0);
|
||||
wr->small().offset = wr_offset;
|
||||
wr->small().len = wr_len;
|
||||
wr->small().location = buffer_pos;
|
||||
fromhexstr(rec["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
|
||||
fromhexstr(rec["data"].string_value(), wr_len, new_journal_buf+buffer_pos);
|
||||
if (wr_len > 0)
|
||||
{
|
||||
if (!new_meta_hdr->data_csum_type)
|
||||
*wr->get_checksum(&heap) = crc32c(0, new_journal_buf+buffer_pos, wr_len);
|
||||
else
|
||||
heap.calc_block_checksums((uint32_t*)wr->get_checksums(&heap), new_journal_buf+buffer_pos, NULL, wr_offset, wr_offset+wr_len, true, NULL);
|
||||
}
|
||||
buffer_pos += wr_len;
|
||||
}
|
||||
else if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")
|
||||
{
|
||||
wr->entry_type = BS_HEAP_BIG_WRITE | (rec["type"] == "big_write_instant" ? BS_HEAP_STABLE : 0);
|
||||
wr->set_big_location(&heap, sscanf_json(NULL, rec["loc"]));
|
||||
bitmap_set(wr->get_int_bitmap(&heap), wr_offset, wr_len, new_meta_hdr->bitmap_granularity);
|
||||
fromhexstr(rec["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
|
||||
if (new_meta_hdr->data_csum_type != 0)
|
||||
{
|
||||
if ((wr_offset % new_meta_hdr->csum_block_size) || (wr_len % new_meta_hdr->csum_block_size))
|
||||
{
|
||||
fprintf(stderr,
|
||||
"Error: big_write journal entries not aligned to csum_block_size can't be converted between v0.9 and v3.0 metadata\n"
|
||||
"Stop writes and flush the journal or convert OSDs one by one without the journal if you still want to do it.\n");
|
||||
goto close_err;
|
||||
}
|
||||
fromhexstr(rec["block_csums"].string_value(),
|
||||
((wr_offset+wr_len+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size
|
||||
- wr_offset/new_meta_hdr->csum_block_size) * (new_meta_hdr->data_csum_type & 0xFF),
|
||||
wr->get_checksums(&heap));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
wr->size = wr->get_size(&heap);
|
||||
}
|
||||
}
|
||||
obj->crc32c = obj->calc_crc32c();
|
||||
assert(((uint8_t*)wr + wr->size - (uint8_t*)obj) == space_for_this);
|
||||
used_space += space_for_this;
|
||||
total_used_space += space_for_this;
|
||||
}
|
||||
if (used_space > 0 && used_space < new_meta_hdr->meta_block_size-2)
|
||||
{
|
||||
*((uint16_t*)(new_meta_buf + meta_offset + used_space)) = FREE_SPACE_BIT | (uint16_t)(new_meta_hdr->meta_block_size-used_space);
|
||||
}
|
||||
}
|
||||
int r = resize_write_new_meta();
|
||||
if (r == 0)
|
||||
{
|
||||
r = resize_write_new_journal();
|
||||
}
|
||||
free_new_meta();
|
||||
return r;
|
||||
}
|
||||
|
||||
int disk_tool_t::index_journal_by_object(json11::Json journal,
|
||||
std::map<object_id, std::vector<json11::Json::object>> & journal_by_object)
|
||||
{
|
||||
for (const auto & rec: journal.array_items())
|
||||
{
|
||||
object_id oid = {
|
||||
.inode = sscanf_json(NULL, rec["inode"]),
|
||||
.stripe = sscanf_json(NULL, rec["stripe"]),
|
||||
};
|
||||
auto & jbo = journal_by_object[oid];
|
||||
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
|
||||
{
|
||||
jbo.push_back(rec.object_items());
|
||||
}
|
||||
else if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")
|
||||
{
|
||||
if (rec["type"] == "big_write_instant")
|
||||
jbo.clear();
|
||||
jbo.push_back(rec.object_items());
|
||||
}
|
||||
else if (rec["type"] == "delete")
|
||||
{
|
||||
jbo.clear();
|
||||
}
|
||||
else if (rec["type"] == "stable")
|
||||
{
|
||||
uint64_t commit_to = rec["version"].uint64_value();
|
||||
for (size_t i = 0; i < jbo.size(); i++)
|
||||
{
|
||||
if (jbo[i]["version"].uint64_value() <= commit_to)
|
||||
{
|
||||
if (jbo[i]["type"] == "big_write")
|
||||
{
|
||||
jbo.erase(jbo.begin(), jbo.begin()+i);
|
||||
i = 0;
|
||||
jbo[i]["type"] = "big_write_instant";
|
||||
}
|
||||
else if (jbo[i]["type"] == "small_write")
|
||||
{
|
||||
jbo[i]["type"] = "small_write_instant";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (rec["type"] == "rollback")
|
||||
{
|
||||
uint64_t rollback_to = rec["version"].uint64_value();
|
||||
for (size_t i = jbo.size()-1; i >= 0; i--)
|
||||
{
|
||||
if (jbo[i]["version"].uint64_value() > rollback_to)
|
||||
jbo.erase(jbo.begin()+i, jbo.begin()+i+1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Unknown journal entry type: %s\n", rec.dump().c_str());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@
|
||||
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd, json11::Json::object & result)
|
||||
{
|
||||
static const char *allow_additional_params[] = {
|
||||
"meta_format",
|
||||
"data_csum_type",
|
||||
"csum_block_size",
|
||||
"autosync_writes",
|
||||
@@ -72,6 +73,10 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
options["disable_"+dev+"_fsync"] = "1";
|
||||
}
|
||||
}
|
||||
if (options["meta_format"] == "")
|
||||
{
|
||||
options["meta_format"] = std::to_string(BLOCKSTORE_META_FORMAT_HEAP);
|
||||
}
|
||||
if (options["meta_device"] == "" || options["meta_device"] == options["data_device"])
|
||||
{
|
||||
options["disable_meta_fsync"] = options["disable_data_fsync"];
|
||||
@@ -108,35 +113,40 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
if (options.find("autosync_writes") == options.end())
|
||||
options["autosync_writes"] = "512";
|
||||
}
|
||||
uint64_t new_meta_len = parse_size(options["meta_len"]);
|
||||
json11::Json::object sb;
|
||||
blockstore_disk_t dsk;
|
||||
try
|
||||
{
|
||||
dsk.parse_config(options);
|
||||
// Set all offsets to 4096 to calculate metadata size with excess
|
||||
// Calculate metadata sizes (with excess)
|
||||
dsk.journal_offset = 4096;
|
||||
dsk.meta_offset = 4096;
|
||||
dsk.data_offset = 4096;
|
||||
dsk.meta_offset = 4096 + (dsk.meta_device == dsk.journal_device ? dsk.cfg_journal_size : 0);
|
||||
dsk.data_offset = 4096 + (dsk.data_device == dsk.meta_device && new_meta_len ? new_meta_len : 0) +
|
||||
(dsk.data_device == dsk.journal_device ? dsk.cfg_journal_size : 0);
|
||||
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
if (dsk.data_device == dsk.meta_device && !new_meta_len)
|
||||
dsk.data_offset += (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP ? dsk.min_meta_len*2 : dsk.min_meta_len);
|
||||
dsk.meta_area_size = (dsk.data_device == dsk.meta_device ? dsk.data_offset : dsk.meta_device_size) - dsk.meta_offset;
|
||||
sb = json11::Json::object {
|
||||
{ "meta_format", options["meta_format"] },
|
||||
{ "data_device", options["data_device"] },
|
||||
{ "meta_device", options["meta_device"] },
|
||||
{ "journal_device", options["journal_device"] },
|
||||
{ "block_size", (uint64_t)dsk.data_block_size },
|
||||
{ "meta_block_size", dsk.meta_block_size },
|
||||
{ "journal_block_size", dsk.journal_block_size },
|
||||
{ "meta_block_size", (uint64_t)dsk.meta_block_size },
|
||||
{ "journal_block_size", (uint64_t)dsk.journal_block_size },
|
||||
{ "data_size", dsk.cfg_data_size },
|
||||
{ "disk_alignment", (uint64_t)dsk.disk_alignment },
|
||||
{ "bitmap_granularity", dsk.bitmap_granularity },
|
||||
{ "bitmap_granularity", (uint64_t)dsk.bitmap_granularity },
|
||||
{ "disable_device_lock", dsk.disable_flock },
|
||||
{ "journal_offset", 4096 },
|
||||
{ "meta_offset", 4096 + (dsk.meta_device == dsk.journal_device ? dsk.journal_len : 0) },
|
||||
{ "data_offset", 4096 + (dsk.data_device == dsk.meta_device ? dsk.meta_len : 0) +
|
||||
(dsk.data_device == dsk.journal_device ? dsk.journal_len : 0) },
|
||||
{ "meta_offset", dsk.meta_offset },
|
||||
{ "data_offset", dsk.data_offset },
|
||||
{ "journal_no_same_sector_overwrites", !is_hdd || is_hybrid },
|
||||
{ "journal_sector_buffer_count", 1024 },
|
||||
{ "disable_data_fsync", json_is_true(options["disable_data_fsync"]) },
|
||||
@@ -180,7 +190,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
}
|
||||
sb["osd_num"] = osd_num;
|
||||
// Zero out metadata and journal
|
||||
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 ||
|
||||
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_area_size) != 0 ||
|
||||
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
|
||||
@@ -435,7 +445,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
|
||||
}
|
||||
|
||||
std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & dev,
|
||||
uint64_t osd_per_disk, uint64_t max_other_percent)
|
||||
uint64_t osd_per_disk, uint64_t max_other_percent, uint64_t *check_new_count)
|
||||
{
|
||||
std::vector<std::string> use_parts;
|
||||
uint64_t want_parts = 0;
|
||||
@@ -457,7 +467,6 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
|
||||
{
|
||||
// Use this partition
|
||||
use_parts.push_back(part["uuid"].string_value());
|
||||
osds_exist++;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -480,9 +489,21 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
|
||||
}
|
||||
// Still create OSD(s) if a disk has no more than (max_other_percent) other data
|
||||
if (osds_exist >= osd_per_disk || (dev.free+osds_size) < dev.size*(100-max_other_percent)/100)
|
||||
{
|
||||
fprintf(stderr, "%s is already partitioned, skipping\n", dev.path.c_str());
|
||||
use_parts.clear();
|
||||
}
|
||||
else
|
||||
want_parts = osd_per_disk-osds_exist;
|
||||
{
|
||||
if (use_parts.size() >= osd_per_disk-osds_exist)
|
||||
use_parts.resize(osd_per_disk-osds_exist);
|
||||
want_parts = osd_per_disk-osds_exist-use_parts.size();
|
||||
}
|
||||
}
|
||||
if (check_new_count)
|
||||
{
|
||||
*check_new_count = want_parts;
|
||||
return use_parts;
|
||||
}
|
||||
if (want_parts > 0)
|
||||
{
|
||||
@@ -516,9 +537,9 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
dsk.close_all();
|
||||
meta_size = dsk.meta_len;
|
||||
meta_size = dsk.min_meta_len;
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
@@ -684,10 +705,25 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||
}
|
||||
json11::Json::array all_results, errors;
|
||||
auto journal_size = options["journal_size"];
|
||||
if (options.find("dry_run") != options.end())
|
||||
{
|
||||
json11::Json::array results;
|
||||
for (auto & dev: devinfo)
|
||||
{
|
||||
uint64_t new_part_count = 0;
|
||||
auto existing_part_count = get_new_data_parts(dev, osd_per_disk, max_other_percent, &new_part_count).size();
|
||||
results.push_back(json11::Json::object{ { "device_path", dev.path }, { "new_osd_count", existing_part_count+new_part_count } });
|
||||
if (!json && new_part_count+existing_part_count > 0)
|
||||
printf("Will initialize %ju OSD(s) on %s\n", existing_part_count+new_part_count, dev.path.c_str());
|
||||
}
|
||||
if (json)
|
||||
printf("%s\n", json11::Json(json11::Json::object{{ "devices", results }}).dump().c_str());
|
||||
return 0;
|
||||
}
|
||||
for (auto & dev: devinfo)
|
||||
{
|
||||
// Select new partitions and create an OSD on each of them
|
||||
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
|
||||
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent, NULL))
|
||||
{
|
||||
options["force"] = true;
|
||||
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
|
||||
|
@@ -1,6 +1,9 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#define _XOPEN_SOURCE
|
||||
#include <limits.h>
|
||||
|
||||
#include "disk_tool.h"
|
||||
#include "rw_blocking.h"
|
||||
#include "str_util.h"
|
||||
@@ -24,65 +27,87 @@ int disk_tool_t::raw_resize()
|
||||
// Parse parameters
|
||||
r = resize_parse_params();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
// Fill allocator
|
||||
fprintf(stderr, "Reading metadata\n");
|
||||
data_alloc = new allocator_t((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size);
|
||||
r = process_meta(
|
||||
[this](blockstore_meta_header_v2_t *hdr)
|
||||
[this](blockstore_meta_header_v3_t *hdr)
|
||||
{
|
||||
resize_init(hdr);
|
||||
},
|
||||
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
|
||||
{
|
||||
for (auto wr = obj->get_writes(); wr; wr = wr->next())
|
||||
{
|
||||
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
|
||||
{
|
||||
data_alloc->set(wr->big().block_num, true);
|
||||
}
|
||||
}
|
||||
},
|
||||
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
{
|
||||
data_alloc->set(block_num, true);
|
||||
}
|
||||
},
|
||||
true, true
|
||||
);
|
||||
if (r != 0)
|
||||
return r;
|
||||
fprintf(stderr, "Reading journal\n");
|
||||
r = process_journal([this](void *buf)
|
||||
goto ret;
|
||||
if (dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
return process_journal_block(buf, [this](int num, journal_entry *je)
|
||||
fprintf(stderr, "Reading journal\n");
|
||||
r = process_journal([this](void *buf)
|
||||
{
|
||||
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
|
||||
return process_journal_block(buf, [this](int num, journal_entry *je)
|
||||
{
|
||||
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
|
||||
}
|
||||
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
|
||||
{
|
||||
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
if (r != 0)
|
||||
return r;
|
||||
if (r != 0)
|
||||
goto ret;
|
||||
}
|
||||
// Remap blocks
|
||||
r = resize_remap_blocks();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
// Copy data blocks into new places
|
||||
fprintf(stderr, "Moving data blocks\n");
|
||||
r = resize_copy_data();
|
||||
if (r != 0)
|
||||
return r;
|
||||
// Rewrite journal
|
||||
fprintf(stderr, "Rebuilding journal\n");
|
||||
r = resize_rewrite_journal();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
// Rewrite metadata
|
||||
resize_alloc_journal();
|
||||
fprintf(stderr, "Rebuilding metadata\n");
|
||||
r = resize_rewrite_meta();
|
||||
r = resize_rebuild_meta();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
if (new_meta_format != BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
// Rewrite journal
|
||||
fprintf(stderr, "Rebuilding journal\n");
|
||||
r = resize_rebuild_journal();
|
||||
if (r != 0)
|
||||
goto ret;
|
||||
fprintf(stderr, "Writing new journal\n");
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "Writing new buffer area\n");
|
||||
// Write new journal
|
||||
fprintf(stderr, "Writing new journal\n");
|
||||
r = resize_write_new_journal();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
// Write new metadata
|
||||
fprintf(stderr, "Writing new metadata\n");
|
||||
r = resize_write_new_meta();
|
||||
if (r != 0)
|
||||
return r;
|
||||
goto ret;
|
||||
fprintf(stderr, "Done\n");
|
||||
ret:
|
||||
free_new_meta();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -126,6 +151,8 @@ int disk_tool_t::resize_parse_params()
|
||||
? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
|
||||
new_journal_len = options.find("new_journal_len") != options.end()
|
||||
? parse_size(options["new_journal_len"]) : dsk.journal_len;
|
||||
new_meta_format = options.find("new_meta_format") != options.end()
|
||||
? stoull_full(options["new_meta_format"]) : 0;
|
||||
if (new_data_len+new_data_offset > dsk.data_device_size)
|
||||
new_data_len = dsk.data_device_size-new_data_offset;
|
||||
if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
|
||||
@@ -139,7 +166,7 @@ int disk_tool_t::resize_parse_params()
|
||||
new_data_offset == dsk.data_offset &&
|
||||
new_data_len == dsk.data_len &&
|
||||
new_meta_offset == dsk.meta_offset &&
|
||||
(new_meta_len == dsk.meta_len || new_meta_len == 0) &&
|
||||
(new_meta_len == dsk.meta_area_size || new_meta_len == 0) &&
|
||||
new_journal_offset == dsk.journal_offset &&
|
||||
new_journal_len == dsk.journal_len &&
|
||||
options.find("force") == options.end())
|
||||
@@ -151,7 +178,7 @@ int disk_tool_t::resize_parse_params()
|
||||
return 0;
|
||||
}
|
||||
|
||||
void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
|
||||
void disk_tool_t::resize_init(blockstore_meta_header_v3_t *hdr)
|
||||
{
|
||||
if (hdr && dsk.data_block_size != hdr->data_block_size)
|
||||
{
|
||||
@@ -170,6 +197,15 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
|
||||
dsk.data_csum_type = hdr->data_csum_type;
|
||||
dsk.csum_block_size = hdr->csum_block_size;
|
||||
}
|
||||
if (hdr && dsk.meta_format != hdr->version)
|
||||
{
|
||||
dsk.meta_format = hdr->version;
|
||||
}
|
||||
if (new_meta_format == 0)
|
||||
{
|
||||
new_meta_format = hdr && hdr->version == BLOCKSTORE_META_FORMAT_HEAP ? BLOCKSTORE_META_FORMAT_HEAP : BLOCKSTORE_META_FORMAT_V2;
|
||||
}
|
||||
dsk.calc_lengths();
|
||||
if (((new_data_offset-dsk.data_offset) % dsk.data_block_size))
|
||||
{
|
||||
fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n",
|
||||
@@ -359,15 +395,57 @@ int disk_tool_t::resize_copy_data()
|
||||
return 0;
|
||||
}
|
||||
|
||||
int disk_tool_t::resize_rewrite_journal()
|
||||
void disk_tool_t::resize_alloc_journal()
|
||||
{
|
||||
// Simply overwriting on the fly may be impossible because old and new areas may overlap
|
||||
// For now, just build new journal data in memory
|
||||
new_journal_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_journal_len);
|
||||
memset(new_journal_buf, 0, new_journal_len);
|
||||
new_journal_ptr = new_journal_buf;
|
||||
new_journal_data = new_journal_ptr + dsk.journal_block_size;
|
||||
new_journal_in_pos = 0;
|
||||
memset(new_journal_buf, 0, new_journal_len);
|
||||
}
|
||||
|
||||
void disk_tool_t::build_journal_start()
|
||||
{
|
||||
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
|
||||
*((journal_entry_start*)ne) = (journal_entry_start){
|
||||
.magic = JOURNAL_MAGIC,
|
||||
.type = JE_START,
|
||||
.size = sizeof(journal_entry_start),
|
||||
.journal_start = dsk.journal_block_size,
|
||||
.version = JOURNAL_VERSION_V2,
|
||||
.data_csum_type = dsk.data_csum_type,
|
||||
.csum_block_size = dsk.csum_block_size,
|
||||
};
|
||||
ne->crc32 = je_crc32(ne);
|
||||
new_journal_ptr += dsk.journal_block_size;
|
||||
new_journal_data = new_journal_ptr+dsk.journal_block_size;
|
||||
new_journal_in_pos = 0;
|
||||
}
|
||||
|
||||
void disk_tool_t::choose_journal_block(uint32_t je_size)
|
||||
{
|
||||
if (dsk.journal_block_size < new_journal_in_pos+je_size)
|
||||
{
|
||||
new_journal_ptr = new_journal_data;
|
||||
if (new_journal_ptr-new_journal_buf >= new_journal_len)
|
||||
{
|
||||
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
|
||||
exit(1);
|
||||
}
|
||||
new_journal_data = new_journal_ptr+dsk.journal_block_size;
|
||||
new_journal_in_pos = 0;
|
||||
if (dsk.journal_block_size < je_size)
|
||||
{
|
||||
fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je_size);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int disk_tool_t::resize_rebuild_journal()
|
||||
{
|
||||
// Simply overwriting on the fly may be impossible because old and new areas may overlap
|
||||
// For now, just build new journal data in memory
|
||||
process_journal([this](void *buf)
|
||||
{
|
||||
return process_journal_block(buf, [this](int num, journal_entry *je)
|
||||
@@ -384,39 +462,11 @@ int disk_tool_t::resize_rewrite_journal()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
|
||||
*((journal_entry_start*)ne) = (journal_entry_start){
|
||||
.magic = JOURNAL_MAGIC,
|
||||
.type = JE_START,
|
||||
.size = sizeof(journal_entry_start),
|
||||
.journal_start = dsk.journal_block_size,
|
||||
.version = JOURNAL_VERSION_V2,
|
||||
.data_csum_type = dsk.data_csum_type,
|
||||
.csum_block_size = dsk.csum_block_size,
|
||||
};
|
||||
ne->crc32 = je_crc32(ne);
|
||||
new_journal_ptr += dsk.journal_block_size;
|
||||
new_journal_data = new_journal_ptr+dsk.journal_block_size;
|
||||
new_journal_in_pos = 0;
|
||||
build_journal_start();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (dsk.journal_block_size < new_journal_in_pos+je->size)
|
||||
{
|
||||
new_journal_ptr = new_journal_data;
|
||||
if (new_journal_ptr-new_journal_buf >= new_journal_len)
|
||||
{
|
||||
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
|
||||
exit(1);
|
||||
}
|
||||
new_journal_data = new_journal_ptr+dsk.journal_block_size;
|
||||
new_journal_in_pos = 0;
|
||||
if (dsk.journal_block_size < je->size)
|
||||
{
|
||||
fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je->size);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
choose_journal_block(je->size);
|
||||
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
|
||||
memcpy(ne, je, je->size);
|
||||
ne->crc32_prev = new_crc32_prev;
|
||||
@@ -463,30 +513,170 @@ int disk_tool_t::resize_write_new_journal()
|
||||
fsync(new_journal_fd);
|
||||
close(new_journal_fd);
|
||||
new_journal_fd = -1;
|
||||
free(new_journal_buf);
|
||||
new_journal_buf = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int disk_tool_t::resize_rewrite_meta()
|
||||
int disk_tool_t::resize_rebuild_meta()
|
||||
{
|
||||
new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
|
||||
memset(new_meta_buf, 0, new_meta_len);
|
||||
if (new_meta_format == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
new_dsk = dsk;
|
||||
new_dsk.data_offset = new_data_offset;
|
||||
new_dsk.data_len = new_data_len;
|
||||
new_dsk.block_count = new_data_len / dsk.data_block_size;
|
||||
new_dsk.journal_device = new_journal_device;
|
||||
new_dsk.journal_offset = new_journal_offset;
|
||||
new_dsk.journal_len = new_journal_len;
|
||||
new_dsk.meta_device = new_meta_device;
|
||||
new_dsk.meta_offset = new_meta_offset;
|
||||
new_dsk.meta_area_size = new_meta_len;
|
||||
new_dsk.meta_format = new_meta_format;
|
||||
new_heap = new blockstore_heap_t(&new_dsk, NULL, 0);
|
||||
new_meta_hdr = (blockstore_meta_header_v3_t *)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
|
||||
memset(new_meta_hdr, 0, dsk.meta_block_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
|
||||
memset(new_meta_buf, 0, new_meta_len);
|
||||
new_meta_hdr = (blockstore_meta_header_v3_t *)new_meta_buf;
|
||||
}
|
||||
std::vector<heap_write_t*> writes;
|
||||
int r = process_meta(
|
||||
[this](blockstore_meta_header_v2_t *hdr)
|
||||
[&](blockstore_meta_header_v3_t *hdr)
|
||||
{
|
||||
blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
|
||||
new_hdr->zero = 0;
|
||||
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
|
||||
new_hdr->version = BLOCKSTORE_META_FORMAT_V2;
|
||||
new_hdr->meta_block_size = dsk.meta_block_size;
|
||||
new_hdr->data_block_size = dsk.data_block_size;
|
||||
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
|
||||
new_hdr->data_csum_type = dsk.data_csum_type;
|
||||
new_hdr->csum_block_size = dsk.csum_block_size;
|
||||
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
|
||||
new_meta_hdr->zero = 0;
|
||||
new_meta_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
|
||||
new_meta_hdr->version = new_meta_format == 0 ? BLOCKSTORE_META_FORMAT_HEAP : new_meta_format;
|
||||
new_meta_hdr->meta_block_size = dsk.meta_block_size;
|
||||
new_meta_hdr->data_block_size = dsk.data_block_size;
|
||||
new_meta_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
|
||||
new_meta_hdr->data_csum_type = dsk.data_csum_type;
|
||||
new_meta_hdr->csum_block_size = dsk.csum_block_size;
|
||||
new_meta_hdr->compacted_lsn = hdr->compacted_lsn;
|
||||
new_meta_hdr->header_csum = 0;
|
||||
new_meta_hdr->header_csum = crc32c(0, new_meta_hdr, new_meta_hdr->version == BLOCKSTORE_META_FORMAT_HEAP
|
||||
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
|
||||
if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP && new_meta_format != BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
build_journal_start();
|
||||
}
|
||||
},
|
||||
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
[&](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
|
||||
{
|
||||
for (auto wr = obj->get_writes(); wr; wr = wr->next())
|
||||
{
|
||||
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
|
||||
{
|
||||
uint64_t block_num = wr->big().block_num;
|
||||
auto remap_it = data_remap.find(block_num);
|
||||
if (remap_it != data_remap.end())
|
||||
block_num = remap_it->second;
|
||||
if (block_num < free_first || block_num >= total_blocks-free_last)
|
||||
{
|
||||
fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last);
|
||||
exit(1);
|
||||
}
|
||||
block_num += data_idx_diff;
|
||||
wr->big().block_num = block_num;
|
||||
}
|
||||
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE)
|
||||
{
|
||||
if (new_heap && wr->small().len > 0)
|
||||
{
|
||||
if (new_journal_ptr-new_journal_buf+wr->small().len > new_journal_len)
|
||||
{
|
||||
fprintf(stderr, "Small write data doesn't fit into the new buffer area\n");
|
||||
exit(1);
|
||||
}
|
||||
memcpy(new_journal_ptr, buffer_area+wr->small().location, wr->small().len);
|
||||
wr->small().location = new_journal_ptr-new_journal_buf;
|
||||
new_journal_ptr += wr->small().len;
|
||||
}
|
||||
}
|
||||
else if (!new_heap)
|
||||
{
|
||||
fprintf(stderr, "Object %jx:%jx can't be converted to the old format because it contains %s\n",
|
||||
obj->inode, obj->stripe, (wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_TOMBSTONE
|
||||
? "a tombstone" : ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE ? "an intent_write entry" : "an unknown entry"));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (new_heap)
|
||||
{
|
||||
// New -> New
|
||||
new_heap->copy_object(obj, NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill journal
|
||||
writes.clear();
|
||||
for (auto wr = obj->get_writes(); wr; wr = wr->next())
|
||||
{
|
||||
writes.push_back(wr);
|
||||
}
|
||||
for (ssize_t i = writes.size()-2; i >= 0; i--)
|
||||
{
|
||||
auto wr = writes[i];
|
||||
assert((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE || wr->entry_type == BS_HEAP_BIG_WRITE);
|
||||
uint32_t je_size = ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE
|
||||
? sizeof(journal_entry_small_write) + dsk.dirty_dyn_size(wr->small().offset, wr->small().len)
|
||||
: sizeof(journal_entry_big_write) + dsk.dirty_dyn_size(0, dsk.data_block_size));
|
||||
choose_journal_block(je_size);
|
||||
journal_entry *je = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
|
||||
je->magic = JOURNAL_MAGIC;
|
||||
je->type = (wr->entry_type & BS_HEAP_STABLE) ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE;
|
||||
je->size = je_size;
|
||||
je->crc32_prev = new_crc32_prev;
|
||||
je->small_write.oid = (object_id){ .inode = obj->inode, .stripe = obj->stripe };
|
||||
je->small_write.version = wr->version;
|
||||
if (wr->type() == BS_HEAP_SMALL_WRITE)
|
||||
{
|
||||
je->small_write.offset = wr->small().offset;
|
||||
je->small_write.len = wr->small().len;
|
||||
je->small_write.data_offset = new_journal_data-new_journal_buf;
|
||||
if (je->small_write.data_offset + je->small_write.len > new_journal_len)
|
||||
{
|
||||
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
|
||||
exit(1);
|
||||
}
|
||||
memcpy(new_journal_data, buffer_area+wr->small().location, je->small_write.len);
|
||||
new_journal_data += je->small_write.len;
|
||||
if (dsk.data_csum_type == 0 && wr->get_checksum(heap))
|
||||
je->small_write.crc32_data = *wr->get_checksum(heap);
|
||||
}
|
||||
else
|
||||
{
|
||||
je->big_write.location = wr->big_location(heap);
|
||||
}
|
||||
memcpy((uint8_t*)je + je->size, wr->get_ext_bitmap(heap), new_clean_entry_bitmap_size);
|
||||
if (dsk.data_csum_type != 0 && wr->get_checksums(heap))
|
||||
{
|
||||
memcpy((uint8_t*)je + je->size + new_clean_entry_bitmap_size, wr->get_checksums(heap), wr->get_csum_size(heap));
|
||||
}
|
||||
je->crc32 = je_crc32(je);
|
||||
new_journal_in_pos += je->size;
|
||||
new_crc32_prev = je->crc32;
|
||||
}
|
||||
// New -> Old
|
||||
if (writes[writes.size()-1]->entry_type == BS_HEAP_BIG_WRITE|BS_HEAP_STABLE)
|
||||
{
|
||||
auto big_wr = writes[writes.size()-1];
|
||||
uint64_t block_num = big_wr->big().block_num;
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
|
||||
dsk.meta_block_size*(block_num / new_entries_per_block) +
|
||||
new_clean_entry_size*(block_num % new_entries_per_block));
|
||||
new_entry->oid = (object_id){ .inode = obj->inode, .stripe = obj->stripe };
|
||||
new_entry->version = big_wr->version;
|
||||
memcpy(new_entry->bitmap, big_wr->get_ext_bitmap(heap), new_clean_entry_bitmap_size);
|
||||
memcpy(new_entry->bitmap + new_clean_entry_bitmap_size, big_wr->get_int_bitmap(heap), new_clean_entry_bitmap_size);
|
||||
memcpy(new_entry->bitmap + 2*new_clean_entry_bitmap_size, big_wr->get_checksums(heap), new_data_csum_size);
|
||||
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
|
||||
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
|
||||
}
|
||||
}
|
||||
},
|
||||
[&](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
{
|
||||
auto remap_it = data_remap.find(block_num);
|
||||
if (remap_it != data_remap.end())
|
||||
@@ -497,26 +687,42 @@ int disk_tool_t::resize_rewrite_meta()
|
||||
exit(1);
|
||||
}
|
||||
block_num += data_idx_diff;
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
|
||||
dsk.meta_block_size*(block_num / new_entries_per_block) +
|
||||
new_clean_entry_size*(block_num % new_entries_per_block));
|
||||
new_entry->oid = entry->oid;
|
||||
new_entry->version = entry->version;
|
||||
if (bitmap)
|
||||
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
|
||||
if (new_heap)
|
||||
{
|
||||
// Old -> New
|
||||
uint8_t wr_buf[new_heap->get_max_write_entry_size()];
|
||||
heap_write_t *wr = (heap_write_t*)wr_buf;
|
||||
wr->entry_type = BS_HEAP_BIG_WRITE|BS_HEAP_STABLE;
|
||||
wr->big().block_num = block_num;
|
||||
wr->next_pos = 0;
|
||||
wr->size = wr->get_size(new_heap);
|
||||
if (bitmap)
|
||||
{
|
||||
memcpy(wr->get_ext_bitmap(new_heap), bitmap, new_clean_entry_bitmap_size);
|
||||
memcpy(wr->get_int_bitmap(new_heap), bitmap+new_clean_entry_bitmap_size, new_clean_entry_bitmap_size);
|
||||
memcpy(wr->get_checksums(new_heap), bitmap+2*new_clean_entry_bitmap_size, new_data_csum_size);
|
||||
}
|
||||
new_heap->post_write(entry->oid, wr, NULL, NULL);
|
||||
}
|
||||
else
|
||||
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
|
||||
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
|
||||
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
|
||||
}
|
||||
{
|
||||
// Old -> Old
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
|
||||
dsk.meta_block_size*(block_num / new_entries_per_block) +
|
||||
new_clean_entry_size*(block_num % new_entries_per_block));
|
||||
new_entry->oid = entry->oid;
|
||||
new_entry->version = entry->version;
|
||||
if (bitmap)
|
||||
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
|
||||
else
|
||||
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
|
||||
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
|
||||
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
|
||||
}
|
||||
},
|
||||
true, true
|
||||
);
|
||||
if (r != 0)
|
||||
{
|
||||
free(new_meta_buf);
|
||||
new_meta_buf = NULL;
|
||||
return r;
|
||||
}
|
||||
return 0;
|
||||
return r;
|
||||
}
|
||||
|
||||
int disk_tool_t::resize_write_new_meta()
|
||||
@@ -528,11 +734,60 @@ int disk_tool_t::resize_write_new_meta()
|
||||
return 1;
|
||||
}
|
||||
lseek64(new_meta_fd, new_meta_offset, 0);
|
||||
write_blocking(new_meta_fd, new_meta_buf, new_meta_len);
|
||||
if (new_meta_buf)
|
||||
{
|
||||
write_blocking(new_meta_fd, new_meta_buf, new_meta_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(new_heap);
|
||||
uint32_t new_meta_blocks = new_meta_len / dsk.meta_block_size - 1;
|
||||
uint8_t *zero_block = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
|
||||
memset(zero_block, 0, dsk.meta_block_size);
|
||||
std::vector<iovec> iov;
|
||||
iov.reserve(IOV_MAX);
|
||||
iov.push_back((iovec){ .iov_base = new_meta_hdr, .iov_len = dsk.meta_block_size });
|
||||
for (uint32_t i = 0; i < new_meta_blocks; i++)
|
||||
{
|
||||
uint8_t *data = new_heap->get_meta_block(i);
|
||||
iov.push_back((iovec){ .iov_base = data ? data : zero_block, .iov_len = dsk.meta_block_size });
|
||||
if (iov.size() >= IOV_MAX)
|
||||
{
|
||||
writev_blocking(new_meta_fd, iov.data(), iov.size());
|
||||
iov.clear();
|
||||
}
|
||||
}
|
||||
if (iov.size() > 0)
|
||||
writev_blocking(new_meta_fd, iov.data(), iov.size());
|
||||
free(zero_block);
|
||||
zero_block = NULL;
|
||||
}
|
||||
fsync(new_meta_fd);
|
||||
close(new_meta_fd);
|
||||
new_meta_fd = -1;
|
||||
free(new_meta_buf);
|
||||
new_meta_buf = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void disk_tool_t::free_new_meta()
|
||||
{
|
||||
if (new_heap)
|
||||
{
|
||||
delete new_heap;
|
||||
new_heap = NULL;
|
||||
}
|
||||
if ((uint8_t*)new_meta_hdr != new_meta_buf)
|
||||
{
|
||||
free(new_meta_hdr);
|
||||
new_meta_hdr = NULL;
|
||||
}
|
||||
if (new_meta_buf)
|
||||
{
|
||||
free(new_meta_buf);
|
||||
new_meta_buf = NULL;
|
||||
}
|
||||
if (new_journal_buf)
|
||||
{
|
||||
free(new_journal_buf);
|
||||
new_journal_buf = NULL;
|
||||
}
|
||||
}
|
||||
|
@@ -29,7 +29,7 @@ int disk_tool_t::resize_data(std::string device)
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
@@ -61,7 +61,7 @@ int disk_tool_t::resize_data(std::string device)
|
||||
dsk.journal_fd = old_journal_fd;
|
||||
dsk.meta_fd = old_meta_fd;
|
||||
dsk.data_fd = old_data_fd;
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
dsk.journal_fd = -1;
|
||||
dsk.meta_fd = -1;
|
||||
dsk.data_fd = -1;
|
||||
@@ -82,8 +82,10 @@ int disk_tool_t::resize_data(std::string device)
|
||||
auto new_meta_device = move_options.find("new_meta_device") != move_options.end()
|
||||
? move_options["new_meta_device"] : dsk.meta_device;
|
||||
// Calculate new data & meta offsets
|
||||
if (!new_meta_len)
|
||||
new_meta_len = (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP ? dsk.min_meta_len*2 : dsk.min_meta_len);
|
||||
new_data_offset = 4096 + (new_journal_device == dsk.data_device ? new_journal_len : 0) +
|
||||
(new_meta_device == dsk.data_device ? dsk.meta_len : 0);
|
||||
(new_meta_device == dsk.data_device ? new_meta_len : 0);
|
||||
new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
|
||||
if (new_data_offset != dsk.data_offset)
|
||||
move_options["new_data_offset"] = std::to_string(new_data_offset);
|
||||
@@ -236,7 +238,7 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
|
||||
auto new_journal_device = move_options.find("new_journal_device") != move_options.end()
|
||||
? move_options["new_journal_device"] : dsk.journal_device;
|
||||
move_options["new_meta_device"] = dsk.data_device;
|
||||
move_options["new_meta_len"] = std::to_string(dsk.meta_len);
|
||||
move_options["new_meta_len"] = std::to_string(new_meta_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -246,7 +248,6 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
|
||||
std::string parent_dev = get_parent_device(real_dev);
|
||||
if (parent_dev == "")
|
||||
return 1;
|
||||
uint64_t new_meta_len = 0;
|
||||
if (parent_dev == real_dev)
|
||||
{
|
||||
// whole disk - create partition
|
||||
@@ -260,7 +261,7 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
|
||||
fprintf(stderr, "metadata is already on a partition of %s\n", options["move_meta"].c_str());
|
||||
return 0;
|
||||
}
|
||||
new_meta_len = ((dsk.meta_len+1024*1024-1)/1024/1024)*1024*1024;
|
||||
new_meta_len = ((dsk.meta_area_size+1024*1024-1)/1024/1024)*1024*1024;
|
||||
if (!dry_run)
|
||||
{
|
||||
auto devinfos = collect_devices({ real_dev });
|
||||
|
@@ -107,7 +107,7 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
dsk.calc_lengths(true);
|
||||
dsk.calc_lengths();
|
||||
dsk.close_all();
|
||||
}
|
||||
catch (std::exception & e)
|
||||
@@ -116,9 +116,8 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
|
||||
fprintf(stderr, "Error: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
options.erase("meta_format");
|
||||
if (m_is_d && m_o < d_o && d_o-m_o < dsk.meta_len)
|
||||
d_o += ((dsk.meta_len - (d_o-m_o) + blk-1) / blk) * blk;
|
||||
if (m_is_d && m_o < d_o && d_o-m_o < dsk.min_meta_len)
|
||||
d_o += ((dsk.min_meta_len - (d_o-m_o) + blk-1) / blk) * blk;
|
||||
}
|
||||
resize["new_data_offset"] = d_o;
|
||||
resize["new_meta_offset"] = m_o;
|
||||
|
@@ -292,7 +292,7 @@ void osd_t::parse_config(bool init)
|
||||
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||
if (!scrub_list_limit)
|
||||
scrub_list_limit = 1000;
|
||||
scrub_list_limit = 262144;
|
||||
if (!old_auto_scrub && auto_scrub)
|
||||
{
|
||||
// Schedule scrubbing
|
||||
|
@@ -98,6 +98,12 @@ struct osd_pg_lock_t
|
||||
uint64_t state = 0;
|
||||
};
|
||||
|
||||
struct osd_unstable_wr_t
|
||||
{
|
||||
uint64_t latest_ver = 0;
|
||||
uint64_t ver_count = 0;
|
||||
};
|
||||
|
||||
class osd_t
|
||||
{
|
||||
// config
|
||||
@@ -123,6 +129,7 @@ class osd_t
|
||||
int slow_log_interval = 10;
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
||||
int autosync_dirty_per_object = 16;
|
||||
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
||||
uint64_t recovery_queue_depth = 1;
|
||||
uint64_t recovery_sleep_us = 0;
|
||||
@@ -142,7 +149,7 @@ class osd_t
|
||||
uint64_t global_scrub_interval = 30*86400;
|
||||
uint64_t scrub_queue_depth = 1;
|
||||
uint64_t scrub_sleep_ms = 0;
|
||||
uint32_t scrub_list_limit = 1000;
|
||||
uint32_t scrub_list_limit = 262144;
|
||||
bool scrub_find_best = true;
|
||||
uint64_t scrub_ec_max_bruteforce = 100;
|
||||
bool enable_pg_locks = false;
|
||||
@@ -195,7 +202,8 @@ class osd_t
|
||||
|
||||
// Unstable writes
|
||||
uint64_t unstable_write_count = 0;
|
||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||
uint64_t unstable_per_object = 0;
|
||||
std::map<osd_object_id_t, osd_unstable_wr_t> unstable_writes;
|
||||
std::deque<osd_op_t*> syncs_in_progress;
|
||||
|
||||
// client & peer I/O
|
||||
@@ -265,7 +273,6 @@ class osd_t
|
||||
void report_statistics();
|
||||
void report_pg_state(pg_t & pg);
|
||||
void report_pg_states();
|
||||
void apply_no_inode_stats();
|
||||
void apply_pg_count();
|
||||
void apply_pg_config();
|
||||
|
||||
|
@@ -274,19 +274,27 @@ void osd_t::report_statistics()
|
||||
json11::Json::object last_stat;
|
||||
pool_id_t last_pool = 0;
|
||||
std::map<uint64_t, uint64_t> bs_empty_space;
|
||||
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
||||
for (auto kv: bs_inode_space)
|
||||
const auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
||||
for (auto it = bs_inode_space.begin(); it != bs_inode_space.end(); )
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||
uint64_t only_inode_num = INODE_NO_POOL(kv.first);
|
||||
pool_id_t pool_id = INODE_POOL(it->first);
|
||||
uint64_t only_inode_num = INODE_NO_POOL(it->first);
|
||||
if (!last_pool || pool_id != last_pool)
|
||||
{
|
||||
auto pool_it = st_cli.pool_config.find(pool_id);
|
||||
if (pool_it != st_cli.pool_config.end() && !pool_it->second.used_for_app.empty())
|
||||
{
|
||||
// Skip pool
|
||||
it = bs_inode_space.lower_bound(INODE_WITH_POOL(pool_id+1, 0));
|
||||
continue;
|
||||
}
|
||||
if (last_pool)
|
||||
inode_space[std::to_string(last_pool)] = last_stat;
|
||||
last_stat = json11::Json::object();
|
||||
last_pool = pool_id;
|
||||
}
|
||||
last_stat[std::to_string(only_inode_num)] = kv.second;
|
||||
last_stat[std::to_string(only_inode_num)] = it->second;
|
||||
it++;
|
||||
}
|
||||
if (last_pool)
|
||||
inode_space[std::to_string(last_pool)] = last_stat;
|
||||
@@ -461,10 +469,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
parse_config(false);
|
||||
}
|
||||
bool pools = changes.find(st_cli.etcd_prefix+"/config/pools") != changes.end();
|
||||
if (pools)
|
||||
{
|
||||
apply_no_inode_stats();
|
||||
}
|
||||
if (run_primary)
|
||||
{
|
||||
bool pgs = changes.find(st_cli.etcd_prefix+"/pg/config") != changes.end();
|
||||
@@ -495,8 +499,6 @@ void osd_t::on_reload_config_hook(json11::Json::object & global_config)
|
||||
// Acquire lease
|
||||
void osd_t::acquire_lease()
|
||||
{
|
||||
// Apply no_inode_stats before the first statistics report
|
||||
apply_no_inode_stats();
|
||||
// Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
|
||||
st_cli.etcd_call("/lease/grant", json11::Json::object {
|
||||
{ "TTL", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 }
|
||||
@@ -685,7 +687,6 @@ void osd_t::on_load_pgs_hook(bool success)
|
||||
else
|
||||
{
|
||||
peering_state &= ~OSD_LOADING_PGS;
|
||||
apply_no_inode_stats();
|
||||
if (run_primary)
|
||||
{
|
||||
apply_pg_count();
|
||||
@@ -694,23 +695,6 @@ void osd_t::on_load_pgs_hook(bool success)
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::apply_no_inode_stats()
|
||||
{
|
||||
if (!bs)
|
||||
{
|
||||
return;
|
||||
}
|
||||
std::vector<uint64_t> no_inode_stats;
|
||||
for (auto & pool_item: st_cli.pool_config)
|
||||
{
|
||||
if (!pool_item.second.used_for_app.empty())
|
||||
{
|
||||
no_inode_stats.push_back(pool_item.first);
|
||||
}
|
||||
}
|
||||
bs->set_no_inode_stats(no_inode_stats);
|
||||
}
|
||||
|
||||
void osd_t::apply_pg_count()
|
||||
{
|
||||
for (auto & pool_item: st_cli.pool_config)
|
||||
|
@@ -5,18 +5,6 @@
|
||||
|
||||
#include "object_id.h"
|
||||
|
||||
#define POOL_SCHEME_REPLICATED 1
|
||||
#define POOL_SCHEME_XOR 2
|
||||
#define POOL_SCHEME_EC 3
|
||||
#define POOL_ID_MAX 0x10000
|
||||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
typedef uint32_t pool_id_t;
|
||||
|
||||
typedef uint64_t osd_num_t;
|
||||
typedef uint32_t pg_num_t;
|
||||
|
||||
|
@@ -128,6 +128,8 @@ void pg_obj_state_check_t::handle_version()
|
||||
n_copies++;
|
||||
if (replicated && replica > 0 || replica >= pg->pg_size)
|
||||
{
|
||||
printf("Object %jx:%jx has invalid chunk number: %u > %u\n", list[list_pos].oid.inode,
|
||||
list[list_pos].oid.stripe, replica, replicated ? 0 : pg->pg_size);
|
||||
n_invalid++;
|
||||
}
|
||||
else
|
||||
|
@@ -790,9 +790,9 @@ resume_5:
|
||||
if (immediate_commit == IMMEDIATE_NONE)
|
||||
{
|
||||
unstable_write_count++;
|
||||
if (unstable_write_count >= autosync_writes)
|
||||
if (unstable_write_count >= autosync_writes ||
|
||||
unstable_per_object >= autosync_dirty_per_object)
|
||||
{
|
||||
unstable_write_count = 0;
|
||||
autosync();
|
||||
}
|
||||
}
|
||||
|
@@ -9,7 +9,6 @@
|
||||
#define SUBMIT_READ 0
|
||||
#define SUBMIT_RMW_READ 1
|
||||
#define SUBMIT_WRITE 2
|
||||
#define SUBMIT_SCRUB_READ 3
|
||||
|
||||
struct unstable_osd_num_t
|
||||
{
|
||||
@@ -44,6 +43,7 @@ struct osd_primary_op_data_t
|
||||
osd_num_t *dirty_osds;
|
||||
int dirty_osd_count;
|
||||
obj_ver_id *unstable_writes;
|
||||
uint64_t *unstable_ver_counts;
|
||||
obj_ver_osd_t *copies_to_delete;
|
||||
int copies_to_delete_count;
|
||||
};
|
||||
|
@@ -130,7 +130,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
||||
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
|
||||
n_subops++;
|
||||
}
|
||||
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
|
||||
if (zero_read >= 0 && !n_subops && (submit_type == SUBMIT_RMW_READ || rep))
|
||||
n_subops = 1;
|
||||
else
|
||||
zero_read = -1;
|
||||
@@ -153,13 +153,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
for (int role = 0; role < (op_data->pg ? op_data->pg->pg_size : 1); role++)
|
||||
{
|
||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_num_t role_osd_num = osd_set[role];
|
||||
int stripe_num = rep ? 0 : role;
|
||||
osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
|
||||
osd_rmw_stripe_t *si = stripes + stripe_num;
|
||||
if (role_osd_num != 0)
|
||||
{
|
||||
si->osd_num = role_osd_num;
|
||||
|
@@ -45,7 +45,10 @@ resume_2:
|
||||
if (unstable_writes.size() > 0)
|
||||
{
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
|
||||
op_data->unstable_writes = (obj_ver_id*)malloc_or_die(
|
||||
(sizeof(obj_ver_id) + sizeof(uint64_t)) * this->unstable_writes.size());
|
||||
op_data->unstable_ver_counts = (uint64_t*)((uint8_t*)op_data->unstable_writes +
|
||||
sizeof(obj_ver_id) * this->unstable_writes.size());
|
||||
osd_num_t last_osd = 0;
|
||||
int last_start = 0, last_end = 0;
|
||||
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
|
||||
@@ -65,8 +68,9 @@ resume_2:
|
||||
}
|
||||
op_data->unstable_writes[last_end] = (obj_ver_id){
|
||||
.oid = it->first.oid,
|
||||
.version = it->second,
|
||||
.version = it->second.latest_ver,
|
||||
};
|
||||
op_data->unstable_ver_counts[last_end] = it->second.ver_count;
|
||||
last_end++;
|
||||
}
|
||||
if (last_osd != 0)
|
||||
@@ -78,6 +82,8 @@ resume_2:
|
||||
});
|
||||
}
|
||||
this->unstable_writes.clear();
|
||||
this->unstable_write_count = 0;
|
||||
this->unstable_per_object = 0;
|
||||
}
|
||||
{
|
||||
op_data->dirty_pg_count = dirty_pgs.size();
|
||||
@@ -175,11 +181,12 @@ resume_6:
|
||||
};
|
||||
if (pgs.at(wpg).state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
auto & dest = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = unstable_osd.osd_num,
|
||||
.oid = w.oid,
|
||||
}];
|
||||
dest = dest < w.version ? w.version : dest;
|
||||
dest.latest_ver = dest.latest_ver < w.version ? w.version : dest.latest_ver;
|
||||
dest.ver_count += op_data->unstable_ver_counts[unstable_osd.start + i];
|
||||
dirty_pgs.insert(wpg);
|
||||
}
|
||||
}
|
||||
@@ -236,7 +243,7 @@ resume_8:
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
free(op_data->unstable_writes);
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
}
|
||||
|
@@ -409,9 +409,9 @@ continue_others:
|
||||
}
|
||||
// finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
|
||||
finish_op(cur_op, cur_op->reply.hdr.retval);
|
||||
if (unstable_write_count >= autosync_writes)
|
||||
if (unstable_write_count >= autosync_writes ||
|
||||
unstable_per_object >= autosync_dirty_per_object)
|
||||
{
|
||||
unstable_write_count = 0;
|
||||
autosync();
|
||||
}
|
||||
if (next_op)
|
||||
@@ -544,13 +544,17 @@ lazy:
|
||||
for (auto & chunk: loc_set)
|
||||
{
|
||||
this->dirty_osds.insert(chunk.osd_num);
|
||||
this->unstable_writes[(osd_object_id_t){
|
||||
auto & unst = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = chunk.osd_num,
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | chunk.role,
|
||||
},
|
||||
}] = op_data->fact_ver;
|
||||
}];
|
||||
unst.latest_ver = op_data->fact_ver;
|
||||
unst.ver_count++;
|
||||
if (unstable_per_object < unst.ver_count)
|
||||
unstable_per_object = unst.ver_count;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@@ -34,6 +34,25 @@ add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp ../util/alloca
|
||||
add_dependencies(build_tests test_allocator)
|
||||
add_test(NAME test_allocator COMMAND test_allocator)
|
||||
|
||||
# test_heap
|
||||
add_executable(test_heap
|
||||
EXCLUDE_FROM_ALL
|
||||
test_heap.cpp
|
||||
../blockstore/multilist.cpp
|
||||
../blockstore/blockstore_heap.cpp
|
||||
../util/crc32c.c
|
||||
../util/allocator.cpp
|
||||
../blockstore/blockstore_disk.cpp
|
||||
../util/str_util.cpp
|
||||
)
|
||||
target_link_libraries(test_heap
|
||||
${ISAL_LIBRARIES}
|
||||
)
|
||||
add_dependencies(build_tests test_heap)
|
||||
add_test(NAME test_heap COMMAND test_heap)
|
||||
target_compile_options(test_heap PRIVATE -coverage)
|
||||
target_link_options(test_heap PRIVATE -coverage)
|
||||
|
||||
# test_cas
|
||||
add_executable(test_cas
|
||||
test_cas.cpp
|
||||
@@ -47,11 +66,15 @@ add_executable(test_crc32
|
||||
test_crc32.cpp
|
||||
)
|
||||
target_link_libraries(test_crc32
|
||||
vitastor_blk
|
||||
vitastor_blk ${ISAL_LIBRARIES}
|
||||
)
|
||||
|
||||
## test_blockstore, test_shit
|
||||
#add_executable(test_blockstore test_blockstore.cpp)
|
||||
#target_link_libraries(test_blockstore blockstore)
|
||||
# test_blockstore
|
||||
add_executable(test_blockstore EXCLUDE_FROM_ALL test_blockstore.cpp ringloop_mock.cpp)
|
||||
add_dependencies(build_tests test_blockstore)
|
||||
target_link_libraries(test_blockstore vitastor_blk vitastor_common ${ISAL_LIBRARIES})
|
||||
add_test(NAME test_blockstore COMMAND test_blockstore)
|
||||
|
||||
## test_shit
|
||||
#add_executable(test_shit test_shit.cpp osd_peering_pg.cpp)
|
||||
#target_link_libraries(test_shit ${LIBURING_LIBRARIES} m)
|
||||
|
@@ -16,6 +16,7 @@
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "addr_util.h"
|
||||
#include "osd_ops.h"
|
||||
#include "rw_blocking.h"
|
||||
@@ -194,7 +195,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
|
||||
op.sec_rw.version = version;
|
||||
op.sec_rw.offset = offset;
|
||||
op.sec_rw.len = len;
|
||||
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, op.sec_rw.len);
|
||||
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
|
||||
if (!check_reply(r, op, reply, op.sec_rw.len))
|
||||
@@ -221,7 +222,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
|
||||
data = memalign_or_die(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
|
||||
r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
|
||||
if (r != sizeof(obj_ver_id)*reply.hdr.retval)
|
||||
{
|
||||
@@ -254,7 +255,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
|
||||
op.sec_rw.version = version;
|
||||
op.sec_rw.offset = 0;
|
||||
op.sec_rw.len = 128*1024;
|
||||
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, op.sec_rw.len);
|
||||
for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
|
||||
((uint64_t*)data)[i] = pattern;
|
||||
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||
@@ -289,7 +290,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
|
||||
op.rw.inode = inode;
|
||||
op.rw.offset = offset;
|
||||
op.rw.len = len;
|
||||
void *data = memalign(MEM_ALIGNMENT, len);
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, len);
|
||||
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
|
||||
if (!check_reply(r, op, reply, len))
|
||||
@@ -317,7 +318,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
|
||||
op.rw.inode = inode;
|
||||
op.rw.offset = offset;
|
||||
op.rw.len = len;
|
||||
void *data = memalign(MEM_ALIGNMENT, len);
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, len);
|
||||
set_pattern(data, len, pattern);
|
||||
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
|
||||
write_blocking(connect_fd, data, len);
|
||||
@@ -363,7 +364,7 @@ void test_list_stab(int connect_fd)
|
||||
assert(check_reply(r, op, reply, -1));
|
||||
int total_count = reply.hdr.retval;
|
||||
int stable_count = reply.sec_list.stable_count;
|
||||
obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
|
||||
obj_ver_id *data = (obj_ver_id*)malloc_or_die(total_count * sizeof(obj_ver_id));
|
||||
assert(data);
|
||||
assert(read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id)) == (total_count * sizeof(obj_ver_id)));
|
||||
int last_start = stable_count;
|
||||
@@ -381,7 +382,7 @@ void test_list_stab(int connect_fd)
|
||||
last_start = i;
|
||||
}
|
||||
}
|
||||
obj_ver_id *data2 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 32);
|
||||
obj_ver_id *data2 = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * 32);
|
||||
assert(data2);
|
||||
free(data2);
|
||||
free(data);
|
||||
|
395
src/test/ringloop_mock.cpp
Normal file
395
src/test/ringloop_mock.cpp
Normal file
@@ -0,0 +1,395 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <random>
|
||||
|
||||
#include "ringloop_mock.h"
|
||||
#include "malloc_or_die.h"
|
||||
|
||||
ring_loop_mock_t::ring_loop_mock_t(int qd, std::function<void(io_uring_sqe *)> submit_cb)
|
||||
{
|
||||
this->submit_cb = std::move(submit_cb);
|
||||
sqes.resize(qd);
|
||||
ring_datas.resize(qd);
|
||||
free_ring_datas.reserve(qd);
|
||||
submit_ring_datas.reserve(qd);
|
||||
completed_ring_datas.reserve(qd);
|
||||
for (size_t i = 0; i < ring_datas.size(); i++)
|
||||
{
|
||||
free_ring_datas.push_back(ring_datas.data() + i);
|
||||
}
|
||||
in_loop = false;
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::register_consumer(ring_consumer_t *consumer)
|
||||
{
|
||||
unregister_consumer(consumer);
|
||||
consumers.push_back(consumer);
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::unregister_consumer(ring_consumer_t *consumer)
|
||||
{
|
||||
for (int i = 0; i < consumers.size(); i++)
|
||||
{
|
||||
if (consumers[i] == consumer)
|
||||
{
|
||||
consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::wakeup()
|
||||
{
|
||||
loop_again = true;
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::set_immediate(const std::function<void()> & cb)
|
||||
{
|
||||
immediate_queue.push_back(cb);
|
||||
wakeup();
|
||||
}
|
||||
|
||||
unsigned ring_loop_mock_t::space_left()
|
||||
{
|
||||
return free_ring_datas.size();
|
||||
}
|
||||
|
||||
bool ring_loop_mock_t::has_work()
|
||||
{
|
||||
return loop_again;
|
||||
}
|
||||
|
||||
bool ring_loop_mock_t::has_sendmsg_zc()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
int ring_loop_mock_t::register_eventfd()
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
io_uring_sqe* ring_loop_mock_t::get_sqe()
|
||||
{
|
||||
if (free_ring_datas.size() == 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
ring_data_t *d = free_ring_datas.back();
|
||||
free_ring_datas.pop_back();
|
||||
submit_ring_datas.push_back(d);
|
||||
io_uring_sqe *sqe = &sqes[d - ring_datas.data()];
|
||||
*sqe = { 0 };
|
||||
io_uring_sqe_set_data(sqe, d);
|
||||
return sqe;
|
||||
}
|
||||
|
||||
int ring_loop_mock_t::submit()
|
||||
{
|
||||
for (size_t i = 0; i < submit_ring_datas.size(); i++)
|
||||
{
|
||||
submit_cb(&sqes[submit_ring_datas[i] - ring_datas.data()]);
|
||||
}
|
||||
submit_ring_datas.clear();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ring_loop_mock_t::wait()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned ring_loop_mock_t::save()
|
||||
{
|
||||
return submit_ring_datas.size();
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::restore(unsigned sqe_tail)
|
||||
{
|
||||
while (submit_ring_datas.size() > sqe_tail)
|
||||
{
|
||||
free_ring_datas.push_back(submit_ring_datas.back());
|
||||
submit_ring_datas.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::loop()
|
||||
{
|
||||
if (in_loop)
|
||||
{
|
||||
return;
|
||||
}
|
||||
in_loop = true;
|
||||
submit();
|
||||
while (completed_ring_datas.size())
|
||||
{
|
||||
ring_data_t *d = completed_ring_datas.back();
|
||||
completed_ring_datas.pop_back();
|
||||
if (d->callback)
|
||||
{
|
||||
struct ring_data_t dl;
|
||||
dl.iov = d->iov;
|
||||
dl.res = d->res;
|
||||
dl.more = dl.prev = false;
|
||||
dl.callback.swap(d->callback);
|
||||
free_ring_datas.push_back(d);
|
||||
dl.callback(&dl);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Warning: empty callback in SQE\n");
|
||||
free_ring_datas.push_back(d);
|
||||
}
|
||||
}
|
||||
do
|
||||
{
|
||||
loop_again = false;
|
||||
for (int i = 0; i < consumers.size(); i++)
|
||||
{
|
||||
consumers[i]->loop();
|
||||
if (immediate_queue.size())
|
||||
{
|
||||
immediate_queue2.swap(immediate_queue);
|
||||
for (auto & cb: immediate_queue2)
|
||||
cb();
|
||||
immediate_queue2.clear();
|
||||
}
|
||||
}
|
||||
} while (loop_again);
|
||||
in_loop = false;
|
||||
}
|
||||
|
||||
void ring_loop_mock_t::mark_completed(ring_data_t *data)
|
||||
{
|
||||
completed_ring_datas.push_back(data);
|
||||
wakeup();
|
||||
}
|
||||
|
||||
disk_mock_t::disk_mock_t(size_t size, bool buffered)
|
||||
{
|
||||
this->size = size;
|
||||
this->data = (uint8_t*)malloc_or_die(size);
|
||||
this->buffered = buffered;
|
||||
}
|
||||
|
||||
disk_mock_t::~disk_mock_t()
|
||||
{
|
||||
discard_buffers(true, 0);
|
||||
free(data);
|
||||
}
|
||||
|
||||
void disk_mock_t::erase_buffers(uint64_t begin, uint64_t end)
|
||||
{
|
||||
for (auto it = buffers.upper_bound(begin); it != buffers.end(); )
|
||||
{
|
||||
const uint64_t bs = it->first - it->second.iov_len;
|
||||
const uint64_t be = it->first;
|
||||
if (bs >= end)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (bs >= begin && be <= end)
|
||||
{
|
||||
// Remove the whole buffer
|
||||
buffers.erase(it++);
|
||||
}
|
||||
else if (bs < begin && be > end)
|
||||
{
|
||||
// Cut beginning & end & stop
|
||||
uint8_t *ce = (uint8_t*)malloc_or_die(be-end);
|
||||
memcpy(ce, it->second.iov_base + (end-bs), be-end);
|
||||
uint8_t *cs = (uint8_t*)realloc(it->second.iov_base, begin-bs);
|
||||
if (!cs)
|
||||
throw std::bad_alloc();
|
||||
buffers[begin] = (iovec){ .iov_base = cs, .iov_len = begin-bs };
|
||||
buffers[be] = (iovec){ .iov_base = ce, .iov_len = be-end };
|
||||
break;
|
||||
}
|
||||
else if (bs < begin)
|
||||
{
|
||||
// Cut beginning
|
||||
uint8_t *cs = (uint8_t*)realloc(it->second.iov_base, begin-bs);
|
||||
if (!cs)
|
||||
throw std::bad_alloc();
|
||||
buffers[begin] = (iovec){ .iov_base = cs, .iov_len = begin-bs };
|
||||
buffers.erase(it++);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Cut end & stop
|
||||
assert(be > end);
|
||||
uint8_t *ce = (uint8_t*)malloc_or_die(be-end);
|
||||
memcpy(ce, it->second.iov_base + (end-bs), be-end);
|
||||
buffers[be] = (iovec){ .iov_base = ce, .iov_len = be-end };
|
||||
buffers.erase(it);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void disk_mock_t::clear(size_t offset, size_t len)
|
||||
{
|
||||
if (offset < size)
|
||||
{
|
||||
memset(data+offset, 0, len < size-offset ? len : size-offset);
|
||||
}
|
||||
}
|
||||
|
||||
void disk_mock_t::discard_buffers(bool all, uint32_t seed)
|
||||
{
|
||||
if (trace)
|
||||
printf("disk: discard buffers all=%d seed=%u\n", all, seed);
|
||||
if (all)
|
||||
{
|
||||
for (auto & b: buffers)
|
||||
free(b.second.iov_base);
|
||||
buffers.clear();
|
||||
}
|
||||
else
|
||||
{
|
||||
std::mt19937 rnd(seed);
|
||||
for (auto it = buffers.begin(); it != buffers.end(); )
|
||||
{
|
||||
if (rnd() < 0x80000000)
|
||||
{
|
||||
free(it->second.iov_base);
|
||||
buffers.erase(it++);
|
||||
}
|
||||
else
|
||||
it++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t disk_mock_t::copy_from_sqe(io_uring_sqe *sqe, uint8_t *to, uint64_t base_offset)
|
||||
{
|
||||
size_t off = sqe->off;
|
||||
iovec *v = (iovec*)sqe->addr;
|
||||
size_t n = sqe->len;
|
||||
for (size_t i = 0; i < n; i++)
|
||||
{
|
||||
if (off >= size)
|
||||
{
|
||||
off = sqe->off - EINVAL; // :D
|
||||
break;
|
||||
}
|
||||
size_t cur = (off + v[i].iov_len > size ? size-off : v[i].iov_len);
|
||||
if (trace)
|
||||
printf("disk: write %zu+%zu from %jx\n", off, cur, (uint64_t)v[i].iov_base);
|
||||
memcpy(to + off - base_offset, v[i].iov_base, cur);
|
||||
off += v[i].iov_len;
|
||||
}
|
||||
return off - sqe->off;
|
||||
}
|
||||
|
||||
void disk_mock_t::read_item(uint8_t *to, uint64_t offset, uint64_t len)
|
||||
{
|
||||
uint64_t last = offset;
|
||||
for (auto it = buffers.upper_bound(offset); it != buffers.end(); it++)
|
||||
{
|
||||
const uint64_t bs = it->first - it->second.iov_len;
|
||||
const uint64_t be = it->first;
|
||||
if (bs >= offset+len)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (last < bs)
|
||||
{
|
||||
// Fill the gap between buffers
|
||||
memcpy(to+last-offset, data+last, bs-last);
|
||||
last = bs;
|
||||
}
|
||||
if (last < offset)
|
||||
{
|
||||
last = offset;
|
||||
}
|
||||
uint64_t cur_end = be < offset+len ? be : offset+len;
|
||||
memcpy(to+last-offset, it->second.iov_base+last-bs, cur_end-last);
|
||||
last = be;
|
||||
}
|
||||
if (last < offset+len)
|
||||
{
|
||||
// Fill the gap in the end
|
||||
memcpy(to+last-offset, data+last, offset+len-last);
|
||||
}
|
||||
}
|
||||
|
||||
bool disk_mock_t::submit(io_uring_sqe *sqe)
|
||||
{
|
||||
ring_data_t *userdata = (ring_data_t*)sqe->user_data;
|
||||
if (sqe->opcode == IORING_OP_READV)
|
||||
{
|
||||
size_t off = sqe->off;
|
||||
iovec *v = (iovec*)sqe->addr;
|
||||
size_t n = sqe->len;
|
||||
for (size_t i = 0; i < n; i++)
|
||||
{
|
||||
if (off < size)
|
||||
{
|
||||
size_t cur = (off + v[i].iov_len > size ? size-off : v[i].iov_len);
|
||||
if (trace)
|
||||
printf("disk: read %zu+%zu to %jx\n", off, cur, (uint64_t)v[i].iov_base);
|
||||
if (buffers.size())
|
||||
read_item((uint8_t*)v[i].iov_base, off, cur);
|
||||
else
|
||||
memcpy(v[i].iov_base, data + off, cur);
|
||||
}
|
||||
off += v[i].iov_len;
|
||||
}
|
||||
userdata->res = off - sqe->off;
|
||||
}
|
||||
else if (sqe->opcode == IORING_OP_WRITEV)
|
||||
{
|
||||
uint64_t end = 0;
|
||||
if (buffered)
|
||||
{
|
||||
// Remove overwritten parts of buffers
|
||||
end = sqe->off;
|
||||
for (uint32_t i = 0; i < sqe->len; i++)
|
||||
{
|
||||
end += ((iovec*)sqe->addr)[i].iov_len;
|
||||
}
|
||||
erase_buffers(sqe->off, end);
|
||||
}
|
||||
if (!buffered || (sqe->rw_flags & RWF_DSYNC))
|
||||
{
|
||||
// Simple "immediate" mode
|
||||
userdata->res = copy_from_sqe(sqe, data, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Buffered mode
|
||||
uint8_t *buf = (uint8_t*)malloc_or_die(end - sqe->off);
|
||||
userdata->res = copy_from_sqe(sqe, buf, sqe->off);
|
||||
if (userdata->res == -EINVAL)
|
||||
free(buf);
|
||||
else
|
||||
buffers[end] = (iovec){ .iov_base = buf, .iov_len = end-sqe->off };
|
||||
}
|
||||
}
|
||||
else if (sqe->opcode == IORING_OP_FSYNC)
|
||||
{
|
||||
if (trace)
|
||||
printf("disk: fsync\n");
|
||||
if (buffers.size())
|
||||
{
|
||||
for (auto & b: buffers)
|
||||
{
|
||||
memcpy(data + b.first - b.second.iov_len, b.second.iov_base, b.second.iov_len);
|
||||
free(b.second.iov_base);
|
||||
}
|
||||
buffers.clear();
|
||||
}
|
||||
userdata->res = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// Execution variability should also be introduced:
|
||||
// 1) reads submitted in parallel to writes (not after completing the write) should return old or new data randomly
|
||||
// 2) parallel operation completions should be delivered in random order
|
||||
// 3) when fsync is enabled, write cache should be sometimes lost during a simulated power outage
|
||||
return true;
|
||||
}
|
61
src/test/ringloop_mock.h
Normal file
61
src/test/ringloop_mock.h
Normal file
@@ -0,0 +1,61 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ringloop.h"
|
||||
|
||||
class ring_loop_mock_t: public ring_loop_i
|
||||
{
|
||||
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
|
||||
std::vector<ring_consumer_t*> consumers;
|
||||
std::vector<io_uring_sqe> sqes;
|
||||
std::vector<ring_data_t> ring_datas;
|
||||
std::vector<ring_data_t *> free_ring_datas;
|
||||
std::vector<ring_data_t *> submit_ring_datas;
|
||||
std::vector<ring_data_t *> completed_ring_datas;
|
||||
std::function<void(io_uring_sqe *)> submit_cb;
|
||||
bool in_loop;
|
||||
bool loop_again;
|
||||
bool support_zc = false;
|
||||
|
||||
public:
|
||||
ring_loop_mock_t(int qd, std::function<void(io_uring_sqe *)> submit_cb);
|
||||
|
||||
void register_consumer(ring_consumer_t *consumer);
|
||||
void unregister_consumer(ring_consumer_t *consumer);
|
||||
void wakeup();
|
||||
void set_immediate(const std::function<void()> & cb);
|
||||
unsigned space_left();
|
||||
bool has_work();
|
||||
bool has_sendmsg_zc();
|
||||
|
||||
int register_eventfd();
|
||||
io_uring_sqe* get_sqe();
|
||||
int submit();
|
||||
int wait();
|
||||
void loop();
|
||||
unsigned save();
|
||||
void restore(unsigned sqe_tail);
|
||||
|
||||
void mark_completed(ring_data_t *data);
|
||||
};
|
||||
|
||||
class disk_mock_t
|
||||
{
|
||||
uint8_t *data = NULL;
|
||||
std::map<uint64_t, iovec> buffers;
|
||||
size_t size = 0;
|
||||
bool buffered = false;
|
||||
|
||||
void erase_buffers(uint64_t begin, uint64_t end);
|
||||
ssize_t copy_from_sqe(io_uring_sqe *sqe, uint8_t *to, uint64_t base_offset);
|
||||
void read_item(uint8_t *to, uint64_t offset, uint64_t len);
|
||||
public:
|
||||
bool trace = false;
|
||||
disk_mock_t(size_t size, bool buffered);
|
||||
~disk_mock_t();
|
||||
void clear(size_t offset, size_t len);
|
||||
void discard_buffers(bool all, uint32_t seed);
|
||||
bool submit(io_uring_sqe *sqe);
|
||||
};
|
@@ -2,127 +2,544 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <malloc.h>
|
||||
#include "blockstore.h"
|
||||
#include "epoll_manager.h"
|
||||
#include "str_util.h"
|
||||
#include "ringloop_mock.h"
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
struct bs_test_t
|
||||
{
|
||||
blockstore_config_t config;
|
||||
disk_mock_t *data_disk = NULL;
|
||||
disk_mock_t *meta_disk = NULL;
|
||||
std::function<bool(io_uring_sqe*)> sqe_handler;
|
||||
ring_loop_mock_t *ringloop = NULL;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
blockstore_impl_t *bs = NULL;
|
||||
|
||||
~bs_test_t()
|
||||
{
|
||||
destroy();
|
||||
}
|
||||
|
||||
void destroy_bs()
|
||||
{
|
||||
if (bs)
|
||||
{
|
||||
delete bs;
|
||||
bs = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void destroy()
|
||||
{
|
||||
while (bs && !bs->is_safe_to_stop())
|
||||
ringloop->loop();
|
||||
destroy_bs();
|
||||
if (tfd)
|
||||
{
|
||||
delete tfd;
|
||||
tfd = NULL;
|
||||
}
|
||||
if (meta_disk)
|
||||
{
|
||||
delete meta_disk;
|
||||
meta_disk = NULL;
|
||||
}
|
||||
if (data_disk)
|
||||
{
|
||||
delete data_disk;
|
||||
data_disk = NULL;
|
||||
}
|
||||
if (ringloop)
|
||||
{
|
||||
delete ringloop;
|
||||
ringloop = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void default_cfg()
|
||||
{
|
||||
config["data_device"] = "./test_data.bin";
|
||||
config["data_device_size"] = "1073741824";
|
||||
config["data_device_sect"] = "4096";
|
||||
config["meta_offset"] = "0";
|
||||
config["journal_offset"] = "16777216";
|
||||
config["data_offset"] = "33554432";
|
||||
config["disable_data_fsync"] = "1";
|
||||
config["immediate_commit"] = "all";
|
||||
config["log_level"] = "10";
|
||||
config["data_csum_type"] = "crc32c";
|
||||
config["csum_block_size"] = "4096";
|
||||
}
|
||||
|
||||
void init()
|
||||
{
|
||||
if (!ringloop)
|
||||
{
|
||||
ringloop = new ring_loop_mock_t(RINGLOOP_DEFAULT_SIZE, [&](io_uring_sqe *sqe)
|
||||
{
|
||||
if (sqe_handler && sqe_handler(sqe))
|
||||
{
|
||||
}
|
||||
else if (sqe->fd == MOCK_DATA_FD)
|
||||
{
|
||||
bool ok = data_disk->submit(sqe);
|
||||
assert(ok);
|
||||
ringloop->mark_completed((ring_data_t*)sqe->user_data);
|
||||
}
|
||||
else if (sqe->fd == MOCK_META_FD)
|
||||
{
|
||||
bool ok = meta_disk->submit(sqe);
|
||||
assert(ok);
|
||||
ringloop->mark_completed((ring_data_t*)sqe->user_data);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
});
|
||||
}
|
||||
if (!tfd)
|
||||
{
|
||||
tfd = new timerfd_manager_t(nullptr);
|
||||
}
|
||||
if (!data_disk)
|
||||
{
|
||||
data_disk = new disk_mock_t(parse_size(config["data_device_size"]), config["disable_data_fsync"] != "1");
|
||||
data_disk->clear(0, parse_size(config["data_offset"]));
|
||||
}
|
||||
uint64_t meta_size = parse_size(config["meta_device_size"]);
|
||||
if (meta_size && !meta_disk)
|
||||
{
|
||||
meta_disk = new disk_mock_t(meta_size, config["disable_meta_fsync"] != "1");
|
||||
meta_disk->clear(0, meta_size);
|
||||
}
|
||||
if (!bs)
|
||||
{
|
||||
bs = new blockstore_impl_t(config, ringloop, tfd, true);
|
||||
while (!bs->is_started())
|
||||
ringloop->loop();
|
||||
printf("blockstore initialized\n");
|
||||
}
|
||||
}
|
||||
|
||||
void exec_op(blockstore_op_t *op)
|
||||
{
|
||||
bool done = false;
|
||||
op->callback = [&](blockstore_op_t *op)
|
||||
{
|
||||
printf("op opcode=%lu completed retval=%d\n", op->opcode, op->retval);
|
||||
done = true;
|
||||
};
|
||||
bs->enqueue_op(op);
|
||||
while (!done)
|
||||
ringloop->loop();
|
||||
op->callback = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
static bool memcheck(uint8_t *buf, uint8_t byte, size_t len)
|
||||
{
|
||||
for (size_t i = 0; i < len; i++)
|
||||
if (buf[i] != byte)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void test_simple()
|
||||
{
|
||||
printf("\n-- test_simple\n");
|
||||
|
||||
bs_test_t test;
|
||||
test.default_cfg();
|
||||
test.init();
|
||||
|
||||
// Write
|
||||
blockstore_op_t op;
|
||||
uint64_t version = 0;
|
||||
op.opcode = BS_OP_WRITE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 1;
|
||||
op.offset = 16384;
|
||||
op.len = 4096;
|
||||
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
|
||||
memset(op.buf, 0xaa, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Sync
|
||||
printf("version %ju written, syncing\n", op.version);
|
||||
version = op.version;
|
||||
op.opcode = BS_OP_SYNC;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == 0);
|
||||
|
||||
// Commit
|
||||
printf("commit version %ju\n", version);
|
||||
op.opcode = BS_OP_STABLE;
|
||||
op.len = 1;
|
||||
*((obj_ver_id*)op.buf) = {
|
||||
.oid = { .inode = 1, .stripe = 0 },
|
||||
.version = version,
|
||||
};
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == 0);
|
||||
|
||||
// Read
|
||||
printf("reading 0-128K\n");
|
||||
op.opcode = BS_OP_READ;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = UINT64_MAX;
|
||||
op.offset = 0;
|
||||
op.len = 128*1024;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
assert(op.version == 1);
|
||||
uint8_t *cmp = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
|
||||
memset(cmp, 0, 128*1024);
|
||||
memset(cmp+16384, 0xaa, 4096);
|
||||
if (memcmp(op.buf, cmp, 128*1024) == 0)
|
||||
printf("read successful\n");
|
||||
else
|
||||
{
|
||||
printf("read returned incorrect data\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
// Zero-length read
|
||||
printf("reading 0-0\n");
|
||||
op.version = UINT64_MAX;
|
||||
op.offset = 0;
|
||||
op.len = 0;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
assert(op.version == 1);
|
||||
|
||||
// Small read
|
||||
printf("reading 16K-24K\n");
|
||||
op.version = UINT64_MAX;
|
||||
op.offset = 16*1024;
|
||||
op.len = 8*1024;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
assert(!memcmp(op.buf, cmp+16*1024, 8*1024));
|
||||
|
||||
free(cmp);
|
||||
|
||||
free(op.buf);
|
||||
}
|
||||
|
||||
static void test_fsync(bool separate_meta)
|
||||
{
|
||||
printf("\n-- test_fsync%s\n", separate_meta ? " separate_meta" : "");
|
||||
|
||||
bs_test_t test;
|
||||
test.default_cfg();
|
||||
test.config["disable_data_fsync"] = "0";
|
||||
test.config["immediate_commit"] = "none";
|
||||
if (separate_meta)
|
||||
{
|
||||
test.config["meta_device"] = "./test_meta.bin";
|
||||
test.config["disable_meta_fsync"] = "1";
|
||||
test.config["meta_device_size"] = "33554432";
|
||||
test.config["meta_device_sect"] = "4096";
|
||||
test.config["data_offset"] = "0";
|
||||
}
|
||||
test.init();
|
||||
|
||||
// Write
|
||||
printf("writing\n");
|
||||
blockstore_op_t op;
|
||||
op.opcode = BS_OP_WRITE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 1;
|
||||
op.offset = 16384;
|
||||
op.len = 4096;
|
||||
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
|
||||
memset(op.buf, 0xaa, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Destroy and restart without sync
|
||||
printf("destroying\n");
|
||||
test.destroy_bs();
|
||||
test.data_disk->discard_buffers(true, 0);
|
||||
test.init();
|
||||
|
||||
// Check ENOENT
|
||||
printf("checking for ENOENT\n");
|
||||
blockstore_op_t op2;
|
||||
op2.opcode = BS_OP_READ;
|
||||
op2.oid = { .inode = 1, .stripe = 0 };
|
||||
op2.version = UINT64_MAX;
|
||||
op2.offset = 0;
|
||||
op2.len = 128*1024;
|
||||
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
|
||||
test.exec_op(&op2);
|
||||
assert(op2.retval == -ENOENT);
|
||||
|
||||
// Write again
|
||||
printf("writing again\n");
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Sync
|
||||
printf("version %ju written, syncing\n", op.version);
|
||||
op.opcode = BS_OP_SYNC;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == 0);
|
||||
|
||||
// Discard and restart again
|
||||
printf("destroying again\n");
|
||||
test.destroy_bs();
|
||||
test.data_disk->discard_buffers(true, 0);
|
||||
test.init();
|
||||
|
||||
// Check that it's present now
|
||||
printf("checking for OK\n");
|
||||
op2.version = UINT64_MAX;
|
||||
test.exec_op(&op2);
|
||||
assert(op2.retval == op2.len);
|
||||
assert(is_zero(op2.buf, 16*1024));
|
||||
assert(memcmp(op2.buf+16*1024, op.buf, 4*1024) == 0);
|
||||
assert(is_zero(op2.buf+20*1024, 108*1024));
|
||||
|
||||
free(op.buf);
|
||||
free(op2.buf);
|
||||
}
|
||||
|
||||
static void test_intent_over_unstable()
|
||||
{
|
||||
printf("\n-- test_intent_over_unstable\n");
|
||||
|
||||
bs_test_t test;
|
||||
test.default_cfg();
|
||||
test.init();
|
||||
|
||||
// Write
|
||||
printf("writing\n");
|
||||
blockstore_op_t op;
|
||||
op.opcode = BS_OP_WRITE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 1;
|
||||
op.offset = 20480;
|
||||
op.len = 4096;
|
||||
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
|
||||
memset(op.buf, 0xaa, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Write again
|
||||
printf("writing again\n");
|
||||
op.version = 2;
|
||||
op.offset = 28*1024;
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
free(op.buf);
|
||||
}
|
||||
|
||||
static void test_padded_csum_intent(bool perfect)
|
||||
{
|
||||
printf("\n-- test_padded_csum_intent%s\n", perfect ? " perfect_csum_update" : "");
|
||||
|
||||
bs_test_t test;
|
||||
test.default_cfg();
|
||||
test.config["csum_block_size"] = "16384";
|
||||
if (perfect)
|
||||
test.config["perfect_csum_update"] = "1";
|
||||
test.init();
|
||||
|
||||
// Write
|
||||
printf("writing\n");
|
||||
blockstore_op_t op;
|
||||
op.opcode = BS_OP_WRITE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 1;
|
||||
op.offset = 8192;
|
||||
op.len = 4096;
|
||||
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
|
||||
memset(op.buf, 0xaa, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Read
|
||||
printf("reading\n");
|
||||
blockstore_op_t op2;
|
||||
op2.opcode = BS_OP_READ;
|
||||
op2.oid = { .inode = 1, .stripe = 0 };
|
||||
op2.version = UINT64_MAX;
|
||||
op2.offset = 0;
|
||||
op2.len = 128*1024;
|
||||
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
|
||||
test.exec_op(&op2);
|
||||
assert(op2.retval == op2.len);
|
||||
assert(is_zero(op2.buf, 8*1024));
|
||||
assert(memcmp(op2.buf+8*1024, op.buf, 4*1024) == 0);
|
||||
assert(is_zero(op2.buf+12*1024, 116*1024));
|
||||
|
||||
// Write again (intent if not "perfect")
|
||||
printf("writing (%s)\n", perfect ? "small" : "intent");
|
||||
op.version = 2;
|
||||
op.offset = 28*1024;
|
||||
memset(op.buf, 0xbb, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Write again (small because uncompactable)
|
||||
printf("writing (small)\n");
|
||||
op.version = 3;
|
||||
op.offset = 60*1024;
|
||||
memset(op.buf, 0xcc, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Check that these are really big+intent+small writes
|
||||
// (intent is not collapsible because of csum_block_size > bitmap_granularity)
|
||||
heap_object_t *obj = test.bs->heap->read_entry((object_id){ .inode = 1, .stripe = 0 }, NULL);
|
||||
assert(obj);
|
||||
assert(obj->get_writes()->next());
|
||||
assert(obj->get_writes()->next()->next());
|
||||
assert(!obj->get_writes()->next()->next()->next());
|
||||
assert(obj->get_writes()->entry_type == BS_HEAP_SMALL_WRITE);
|
||||
assert(obj->get_writes()->next()->entry_type == (perfect ? BS_HEAP_SMALL_WRITE : BS_HEAP_INTENT_WRITE));
|
||||
assert(obj->get_writes()->next()->next()->entry_type == BS_HEAP_BIG_WRITE);
|
||||
|
||||
// Commit
|
||||
printf("commit version 3\n");
|
||||
op.opcode = BS_OP_STABLE;
|
||||
op.len = 1;
|
||||
*((obj_ver_id*)op.buf) = {
|
||||
.oid = { .inode = 1, .stripe = 0 },
|
||||
.version = 3,
|
||||
};
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == 0);
|
||||
assert(test.bs->heap->get_compact_queue_size());
|
||||
|
||||
// Trigger & wait compaction
|
||||
test.bs->flusher->request_trim();
|
||||
while (test.bs->heap->get_compact_queue_size())
|
||||
test.ringloop->loop();
|
||||
while (test.bs->flusher->is_active())
|
||||
test.ringloop->loop();
|
||||
test.bs->flusher->release_trim();
|
||||
// Check that compaction succeeded
|
||||
assert(!test.bs->heap->get_to_compact_count());
|
||||
|
||||
// Read again and check
|
||||
printf("reading compacted\n");
|
||||
op2.version = UINT64_MAX;
|
||||
test.exec_op(&op2);
|
||||
assert(op2.retval == op2.len);
|
||||
assert(memcheck(op2.buf, 0, 8*1024));
|
||||
assert(memcheck(op2.buf+8*1024, 0xaa, 4*1024));
|
||||
assert(memcheck(op2.buf+12*1024, 0, 16*1024));
|
||||
assert(memcheck(op2.buf+28*1024, 0xbb, 4*1024));
|
||||
assert(memcheck(op2.buf+32*1024, 0, 28*1024));
|
||||
assert(memcheck(op2.buf+60*1024, 0xcc, 4*1024));
|
||||
assert(memcheck(op2.buf+64*1024, 0, 64*1024));
|
||||
|
||||
obj = test.bs->heap->read_entry((object_id){ .inode = 1, .stripe = 0 }, NULL);
|
||||
assert(!obj->get_writes()->next());
|
||||
|
||||
free(op.buf);
|
||||
free(op2.buf);
|
||||
}
|
||||
|
||||
static void test_padded_csum_parallel_read(bool perfect, uint32_t offset)
|
||||
{
|
||||
printf("\n-- test_padded_csum_parallel_read%s offset=%u\n", perfect ? " perfect_csum_update" : "", offset);
|
||||
|
||||
bs_test_t test;
|
||||
test.default_cfg();
|
||||
test.config["csum_block_size"] = "16384";
|
||||
test.config["atomic_write_size"] = "0";
|
||||
if (perfect)
|
||||
test.config["perfect_csum_update"] = "1";
|
||||
test.init();
|
||||
|
||||
// Write
|
||||
printf("writing (initial)\n");
|
||||
blockstore_op_t op;
|
||||
op.opcode = BS_OP_WRITE_STABLE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 1;
|
||||
op.offset = 8192;
|
||||
op.len = 16384;
|
||||
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 16384);
|
||||
memset(op.buf, 0xaa, 16384);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Write 2
|
||||
printf("writing (%u+%u)\n", offset, 4096);
|
||||
op.version = 2;
|
||||
op.offset = offset;
|
||||
op.len = 4096;
|
||||
memset(op.buf, 0xbb, 4096);
|
||||
test.exec_op(&op);
|
||||
assert(op.retval == op.len);
|
||||
|
||||
// Trigger & wait compaction
|
||||
test.bs->flusher->request_trim();
|
||||
std::vector<ring_data_t*> flush_writes;
|
||||
test.sqe_handler = [&](io_uring_sqe *sqe)
|
||||
{
|
||||
if (sqe->fd == MOCK_DATA_FD && sqe->opcode == IORING_OP_WRITEV &&
|
||||
sqe->off >= test.bs->dsk.data_offset)
|
||||
{
|
||||
bool ok = test.data_disk->submit(sqe);
|
||||
assert(ok);
|
||||
flush_writes.push_back((ring_data_t*)sqe->user_data);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
// Wait for 2 flusher writes, execute and pause them
|
||||
while (test.bs->heap->get_compact_queue_size() && flush_writes.size() < 1)
|
||||
test.ringloop->loop();
|
||||
while (test.bs->flusher->is_active() && flush_writes.size() < 1)
|
||||
test.ringloop->loop();
|
||||
// Run a read operation in parallel - it shouldn't complain about checksum errors
|
||||
printf("reading in parallel\n");
|
||||
blockstore_op_t op2;
|
||||
op2.opcode = BS_OP_READ;
|
||||
op2.oid = { .inode = 1, .stripe = 0 };
|
||||
op2.version = 1;
|
||||
op2.offset = 0;
|
||||
op2.len = 128*1024;
|
||||
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
|
||||
test.exec_op(&op2);
|
||||
assert(op2.retval == op2.len);
|
||||
// Continue flushing
|
||||
test.sqe_handler = NULL;
|
||||
for (auto & w: flush_writes)
|
||||
test.ringloop->mark_completed(w);
|
||||
flush_writes.clear();
|
||||
while (test.bs->heap->get_compact_queue_size() && flush_writes.size() < 2)
|
||||
test.ringloop->loop();
|
||||
while (test.bs->flusher->is_active() && flush_writes.size() < 2)
|
||||
test.ringloop->loop();
|
||||
test.bs->flusher->release_trim();
|
||||
// Check that compaction succeeded
|
||||
assert(!test.bs->heap->get_to_compact_count());
|
||||
|
||||
free(op.buf);
|
||||
free(op2.buf);
|
||||
}
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
blockstore_config_t config;
|
||||
config["meta_device"] = "./test_meta.bin";
|
||||
config["journal_device"] = "./test_journal.bin";
|
||||
config["data_device"] = "./test_data.bin";
|
||||
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
|
||||
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);
|
||||
|
||||
blockstore_op_t op;
|
||||
int main_state = 0;
|
||||
uint64_t version = 0;
|
||||
ring_consumer_t main_cons;
|
||||
op.callback = [&](blockstore_op_t *op)
|
||||
{
|
||||
printf("op completed %d\n", op->retval);
|
||||
if (main_state == 1)
|
||||
main_state = 2;
|
||||
else if (main_state == 3)
|
||||
main_state = 4;
|
||||
else if (main_state == 5)
|
||||
main_state = 6;
|
||||
else if (main_state == 7)
|
||||
main_state = 8;
|
||||
else if (main_state == 9)
|
||||
main_state = 10;
|
||||
};
|
||||
main_cons.loop = [&]()
|
||||
{
|
||||
if (main_state == 0)
|
||||
{
|
||||
if (bs->is_started())
|
||||
{
|
||||
printf("init completed\n");
|
||||
op.opcode = BS_OP_WRITE;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = 0;
|
||||
op.offset = 16384;
|
||||
op.len = 4096;
|
||||
op.buf = (uint8_t*)memalign(512, 128*1024);
|
||||
memset(op.buf, 0xaa, 4096);
|
||||
bs->enqueue_op(&op);
|
||||
main_state = 1;
|
||||
}
|
||||
}
|
||||
else if (main_state == 2)
|
||||
{
|
||||
printf("version %ju written, syncing\n", op.version);
|
||||
version = op.version;
|
||||
op.opcode = BS_OP_SYNC;
|
||||
bs->enqueue_op(&op);
|
||||
main_state = 3;
|
||||
}
|
||||
else if (main_state == 4)
|
||||
{
|
||||
printf("stabilizing version %ju\n", version);
|
||||
op.opcode = BS_OP_STABLE;
|
||||
op.len = 1;
|
||||
*((obj_ver_id*)op.buf) = {
|
||||
.oid = { .inode = 1, .stripe = 0 },
|
||||
.version = version,
|
||||
};
|
||||
bs->enqueue_op(&op);
|
||||
main_state = 5;
|
||||
}
|
||||
else if (main_state == 6)
|
||||
{
|
||||
printf("stabilizing version %ju\n", version);
|
||||
op.opcode = BS_OP_STABLE;
|
||||
op.len = 1;
|
||||
*((obj_ver_id*)op.buf) = {
|
||||
.oid = { .inode = 1, .stripe = 0 },
|
||||
.version = version,
|
||||
};
|
||||
bs->enqueue_op(&op);
|
||||
main_state = 7;
|
||||
}
|
||||
else if (main_state == 8)
|
||||
{
|
||||
printf("reading 0-128K\n");
|
||||
op.opcode = BS_OP_READ;
|
||||
op.oid = { .inode = 1, .stripe = 0 };
|
||||
op.version = UINT64_MAX;
|
||||
op.offset = 0;
|
||||
op.len = 128*1024;
|
||||
bs->enqueue_op(&op);
|
||||
main_state = 9;
|
||||
}
|
||||
else if (main_state == 10)
|
||||
{
|
||||
void *cmp = memalign(512, 128*1024);
|
||||
memset(cmp, 0, 128*1024);
|
||||
memset(cmp+16384, 0xaa, 4096);
|
||||
int ok = 1;
|
||||
for (int i = 0; i < 128*1024; i += 4096)
|
||||
{
|
||||
if (memcmp(cmp+i, op.buf+i, 4096) != 0)
|
||||
{
|
||||
printf("bitmap works incorrectly, bytes %d - %d differ (%02x, should be %02x)\n", i, i+4096, ((uint8_t*)op.buf)[i], ((uint8_t*)cmp)[i]);
|
||||
ok = 0;
|
||||
}
|
||||
}
|
||||
if (ok)
|
||||
printf("bitmap works correctly\n");
|
||||
free(cmp);
|
||||
main_state = 11;
|
||||
}
|
||||
};
|
||||
|
||||
ringloop->register_consumer(&main_cons);
|
||||
while (1)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
delete bs;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
test_simple();
|
||||
test_fsync(false);
|
||||
test_fsync(true);
|
||||
test_intent_over_unstable();
|
||||
test_padded_csum_intent(false);
|
||||
test_padded_csum_intent(true);
|
||||
test_padded_csum_parallel_read(false, 8192);
|
||||
test_padded_csum_parallel_read(true, 8192);
|
||||
test_padded_csum_parallel_read(false, 16384);
|
||||
test_padded_csum_parallel_read(true, 16384);
|
||||
return 0;
|
||||
}
|
||||
|
1676
src/test/test_heap.cpp
Normal file
1676
src/test/test_heap.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,11 @@
|
||||
|
||||
#pragma GCC visibility push(default)
|
||||
|
||||
// Memory allocation alignment (page size is usually optimal)
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
inline void* memalign_or_die(size_t alignment, size_t size)
|
||||
{
|
||||
void *buf = memalign(alignment, size);
|
||||
|
@@ -192,7 +192,7 @@ void ring_loop_t::restore(unsigned sqe_tail)
|
||||
ring.sq.sqe_tail = sqe_tail;
|
||||
}
|
||||
|
||||
int ring_loop_t::sqes_left()
|
||||
unsigned ring_loop_t::space_left()
|
||||
{
|
||||
struct io_uring_sq *sq = &ring.sq;
|
||||
unsigned int head = io_uring_smp_load_acquire(sq->khead);
|
||||
|
@@ -14,6 +14,7 @@
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
|
||||
#define RINGLOOP_DEFAULT_SIZE 1024
|
||||
@@ -32,7 +33,27 @@ struct ring_consumer_t
|
||||
std::function<void(void)> loop;
|
||||
};
|
||||
|
||||
class __attribute__((visibility("default"))) ring_loop_t
|
||||
class __attribute__((visibility("default"))) ring_loop_i
|
||||
{
|
||||
public:
|
||||
virtual ~ring_loop_i() = default;
|
||||
virtual void register_consumer(ring_consumer_t *consumer) = 0;
|
||||
virtual void unregister_consumer(ring_consumer_t *consumer) = 0;
|
||||
virtual int register_eventfd() = 0;
|
||||
virtual io_uring_sqe* get_sqe() = 0;
|
||||
virtual void set_immediate(const std::function<void()> & cb) = 0;
|
||||
virtual int submit() = 0;
|
||||
virtual int wait() = 0;
|
||||
virtual unsigned space_left() = 0;
|
||||
virtual bool has_work() = 0;
|
||||
virtual bool has_sendmsg_zc() = 0;
|
||||
virtual void loop() = 0;
|
||||
virtual void wakeup() = 0;
|
||||
virtual unsigned save() = 0;
|
||||
virtual void restore(unsigned sqe_tail) = 0;
|
||||
};
|
||||
|
||||
class __attribute__((visibility("default"))) ring_loop_t: public ring_loop_i
|
||||
{
|
||||
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
|
||||
std::vector<ring_consumer_t*> consumers;
|
||||
@@ -54,7 +75,7 @@ public:
|
||||
int register_eventfd();
|
||||
|
||||
io_uring_sqe* get_sqe();
|
||||
inline void set_immediate(const std::function<void()> cb)
|
||||
inline void set_immediate(const std::function<void()> & cb)
|
||||
{
|
||||
immediate_queue.push_back(cb);
|
||||
wakeup();
|
||||
@@ -68,11 +89,7 @@ public:
|
||||
struct io_uring_cqe *cqe;
|
||||
return io_uring_wait_cqe(&ring, &cqe);
|
||||
}
|
||||
int sqes_left();
|
||||
inline unsigned space_left()
|
||||
{
|
||||
return free_ring_data_ptr;
|
||||
}
|
||||
unsigned space_left();
|
||||
inline bool has_work()
|
||||
{
|
||||
return loop_again;
|
||||
|
2544
src/util/robin_hood.h
Normal file
2544
src/util/robin_hood.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,7 @@ std::string base64_encode(const std::string &in)
|
||||
return out;
|
||||
}
|
||||
|
||||
static char T[256] = { 0 };
|
||||
static int T[256] = { 0 };
|
||||
|
||||
std::string base64_decode(const std::string &in)
|
||||
{
|
||||
|
@@ -4,6 +4,7 @@
|
||||
#include <sys/timerfd.h>
|
||||
#include <sys/poll.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <assert.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
@@ -15,21 +16,27 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
|
||||
{
|
||||
this->set_fd_handler = set_fd_handler;
|
||||
wait_state = 0;
|
||||
timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
|
||||
if (timerfd < 0)
|
||||
if (set_fd_handler)
|
||||
{
|
||||
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
|
||||
timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
|
||||
if (timerfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
|
||||
}
|
||||
set_fd_handler(timerfd, false, [this](int fd, int events)
|
||||
{
|
||||
handle_readable();
|
||||
});
|
||||
}
|
||||
set_fd_handler(timerfd, false, [this](int fd, int events)
|
||||
{
|
||||
handle_readable();
|
||||
});
|
||||
}
|
||||
|
||||
timerfd_manager_t::~timerfd_manager_t()
|
||||
{
|
||||
set_fd_handler(timerfd, false, NULL);
|
||||
close(timerfd);
|
||||
if (timerfd >= 0)
|
||||
{
|
||||
set_fd_handler(timerfd, false, NULL);
|
||||
close(timerfd);
|
||||
}
|
||||
}
|
||||
|
||||
void timerfd_manager_t::inc_timer(timerfd_timer_t & t)
|
||||
@@ -52,7 +59,14 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
|
||||
{
|
||||
int timer_id = id++;
|
||||
timespec start;
|
||||
clock_gettime(CLOCK_MONOTONIC, &start);
|
||||
if (timerfd >= 0)
|
||||
{
|
||||
clock_gettime(CLOCK_MONOTONIC, &start);
|
||||
}
|
||||
else
|
||||
{
|
||||
start = cur;
|
||||
}
|
||||
timers.push_back({
|
||||
.id = timer_id,
|
||||
.micros = micros,
|
||||
@@ -101,7 +115,7 @@ again:
|
||||
{
|
||||
nearest = -1;
|
||||
itimerspec exp = {};
|
||||
if (timerfd_settime(timerfd, 0, &exp, NULL))
|
||||
if (timerfd >= 0 && timerfd_settime(timerfd, 0, &exp, NULL))
|
||||
{
|
||||
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
|
||||
}
|
||||
@@ -120,7 +134,14 @@ again:
|
||||
}
|
||||
}
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
if (timerfd >= 0)
|
||||
{
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
}
|
||||
else
|
||||
{
|
||||
now = cur;
|
||||
}
|
||||
itimerspec exp = {
|
||||
.it_interval = { 0 },
|
||||
.it_value = timers[nearest].next,
|
||||
@@ -142,7 +163,7 @@ again:
|
||||
}
|
||||
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
|
||||
}
|
||||
if (timerfd_settime(timerfd, 0, &exp, NULL))
|
||||
if (timerfd >= 0 && timerfd_settime(timerfd, 0, &exp, NULL))
|
||||
{
|
||||
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
|
||||
}
|
||||
@@ -178,3 +199,13 @@ void timerfd_manager_t::trigger_nearest()
|
||||
nearest = -1;
|
||||
cb(nearest_id);
|
||||
}
|
||||
|
||||
void timerfd_manager_t::tick(timespec passed)
|
||||
{
|
||||
assert(timerfd == -1);
|
||||
cur.tv_sec += passed.tv_sec;
|
||||
cur.tv_nsec += passed.tv_nsec;
|
||||
cur.tv_sec += (cur.tv_nsec / 1000000000);
|
||||
cur.tv_nsec = (cur.tv_nsec % 1000000000);
|
||||
set_nearest(true);
|
||||
}
|
||||
|
@@ -19,11 +19,12 @@ struct timerfd_timer_t
|
||||
class __attribute__((visibility("default"))) timerfd_manager_t
|
||||
{
|
||||
int wait_state = 0;
|
||||
int timerfd;
|
||||
int timerfd = -1;
|
||||
int nearest = -1;
|
||||
int id = 1;
|
||||
int onstack = 0;
|
||||
std::vector<timerfd_timer_t> timers;
|
||||
timespec cur = {};
|
||||
|
||||
void inc_timer(timerfd_timer_t & t);
|
||||
void set_nearest(bool trigger_inline);
|
||||
@@ -37,4 +38,5 @@ public:
|
||||
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
|
||||
int set_timer_us(uint64_t micros, bool repeat, std::function<void(int)> callback);
|
||||
void clear_timer(int timer_id);
|
||||
void tick(timespec passed);
|
||||
};
|
||||
|
237
src/util/wyhash.h
Normal file
237
src/util/wyhash.h
Normal file
@@ -0,0 +1,237 @@
|
||||
// Copied from https://github.com/martinus/unordered_dense, version 4.5.0
|
||||
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2022-2024 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint> // for uint64_t, uint32_t, uint8_t, UINT64_C
|
||||
#include <cstring> // for size_t, memcpy, memset
|
||||
#include <functional> // for equal_to, hash
|
||||
#include <memory> // for allocator, allocator_traits, shared_ptr
|
||||
|
||||
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
|
||||
# define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1) // NOLINT(cppcoreguidelines-macro-usage)
|
||||
# define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage)
|
||||
#else
|
||||
# define ANKERL_UNORDERED_DENSE_LIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
|
||||
# define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
|
||||
#endif
|
||||
|
||||
// This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash
|
||||
// No big-endian support (because different values on different machines don't matter),
|
||||
// hardcodes seed and the secret, reformats the code, and clang-tidy fixes.
|
||||
namespace wyhash {
|
||||
|
||||
namespace detail {
|
||||
|
||||
inline void mum(uint64_t* a, uint64_t* b) {
|
||||
# if defined(__SIZEOF_INT128__)
|
||||
__uint128_t r = *a;
|
||||
r *= *b;
|
||||
*a = static_cast<uint64_t>(r);
|
||||
*b = static_cast<uint64_t>(r >> 64U);
|
||||
# elif defined(_MSC_VER) && defined(_M_X64)
|
||||
*a = _umul128(*a, *b, b);
|
||||
# else
|
||||
uint64_t ha = *a >> 32U;
|
||||
uint64_t hb = *b >> 32U;
|
||||
uint64_t la = static_cast<uint32_t>(*a);
|
||||
uint64_t lb = static_cast<uint32_t>(*b);
|
||||
uint64_t hi{};
|
||||
uint64_t lo{};
|
||||
uint64_t rh = ha * hb;
|
||||
uint64_t rm0 = ha * lb;
|
||||
uint64_t rm1 = hb * la;
|
||||
uint64_t rl = la * lb;
|
||||
uint64_t t = rl + (rm0 << 32U);
|
||||
auto c = static_cast<uint64_t>(t < rl);
|
||||
lo = t + (rm1 << 32U);
|
||||
c += static_cast<uint64_t>(lo < t);
|
||||
hi = rh + (rm0 >> 32U) + (rm1 >> 32U) + c;
|
||||
*a = lo;
|
||||
*b = hi;
|
||||
# endif
|
||||
}
|
||||
|
||||
// multiply and xor mix function, aka MUM
|
||||
inline auto mix(uint64_t a, uint64_t b) -> uint64_t {
|
||||
mum(&a, &b);
|
||||
return a ^ b;
|
||||
}
|
||||
|
||||
// read functions. WARNING: we don't care about endianness, so results are different on big endian!
|
||||
inline auto r8(const uint8_t* p) -> uint64_t {
|
||||
uint64_t v{};
|
||||
std::memcpy(&v, p, 8U);
|
||||
return v;
|
||||
}
|
||||
|
||||
inline auto r4(const uint8_t* p) -> uint64_t {
|
||||
uint32_t v{};
|
||||
std::memcpy(&v, p, 4);
|
||||
return v;
|
||||
}
|
||||
|
||||
// reads 1, 2, or 3 bytes
|
||||
inline auto r3(const uint8_t* p, size_t k) -> uint64_t {
|
||||
return (static_cast<uint64_t>(p[0]) << 16U) | (static_cast<uint64_t>(p[k >> 1U]) << 8U) | p[k - 1];
|
||||
}
|
||||
|
||||
inline auto hash(void const* key, size_t len) -> uint64_t {
|
||||
static uint64_t secret[4] = {UINT64_C(0xa0761d6478bd642f),
|
||||
UINT64_C(0xe7037ed1a0b428db),
|
||||
UINT64_C(0x8ebc6af09c88c6e3),
|
||||
UINT64_C(0x589965cc75374cc3)};
|
||||
|
||||
auto const* p = static_cast<uint8_t const*>(key);
|
||||
uint64_t seed = secret[0];
|
||||
uint64_t a{};
|
||||
uint64_t b{};
|
||||
if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) {
|
||||
if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) {
|
||||
a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U));
|
||||
b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U));
|
||||
} else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) {
|
||||
a = r3(p, len);
|
||||
b = 0;
|
||||
} else {
|
||||
a = 0;
|
||||
b = 0;
|
||||
}
|
||||
} else {
|
||||
size_t i = len;
|
||||
if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) {
|
||||
uint64_t see1 = seed;
|
||||
uint64_t see2 = seed;
|
||||
do {
|
||||
seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
|
||||
see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1);
|
||||
see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2);
|
||||
p += 48;
|
||||
i -= 48;
|
||||
} while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48));
|
||||
seed ^= see1 ^ see2;
|
||||
}
|
||||
while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) {
|
||||
seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
|
||||
i -= 16;
|
||||
p += 16;
|
||||
}
|
||||
a = r8(p + i - 16);
|
||||
b = r8(p + i - 8);
|
||||
}
|
||||
|
||||
return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed));
|
||||
}
|
||||
|
||||
inline auto hash(uint64_t x) -> uint64_t {
|
||||
return mix(x, UINT64_C(0x9E3779B97F4A7C15));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T, typename Enable = void>
|
||||
struct hash {
|
||||
auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
|
||||
-> uint64_t {
|
||||
return std::hash<T>{}(obj);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct hash<T, typename std::hash<T>::is_avalanching> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
|
||||
-> uint64_t {
|
||||
return std::hash<T>{}(obj);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename CharT>
|
||||
struct hash<std::basic_string<CharT>> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(std::basic_string<CharT> const& str) const noexcept -> uint64_t {
|
||||
return detail::hash(str.data(), sizeof(CharT) * str.size());
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct hash<T*> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(T* ptr) const noexcept -> uint64_t {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
|
||||
return detail::hash(reinterpret_cast<uintptr_t>(ptr));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct hash<std::unique_ptr<T>> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(std::unique_ptr<T> const& ptr) const noexcept -> uint64_t {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
|
||||
return detail::hash(reinterpret_cast<uintptr_t>(ptr.get()));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct hash<std::shared_ptr<T>> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(std::shared_ptr<T> const& ptr) const noexcept -> uint64_t {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
|
||||
return detail::hash(reinterpret_cast<uintptr_t>(ptr.get()));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Enum>
|
||||
struct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {
|
||||
using is_avalanching = void;
|
||||
auto operator()(Enum e) const noexcept -> uint64_t {
|
||||
using underlying = typename std::underlying_type_t<Enum>;
|
||||
return detail::hash(static_cast<underlying>(e));
|
||||
}
|
||||
};
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
|
||||
# define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T) \
|
||||
template <> \
|
||||
struct hash<T> { \
|
||||
using is_avalanching = void; \
|
||||
auto operator()(T const& obj) const noexcept -> uint64_t { \
|
||||
return detail::hash(static_cast<uint64_t>(obj)); \
|
||||
} \
|
||||
}
|
||||
|
||||
# if defined(__GNUC__) && !defined(__clang__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wuseless-cast"
|
||||
# endif
|
||||
// see https://en.cppreference.com/w/cpp/utility/hash
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char);
|
||||
# if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L && defined(__cpp_char8_t)
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t);
|
||||
# endif
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(short);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(int);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long);
|
||||
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long);
|
||||
|
||||
# undef ANKERL_UNORDERED_DENSE_HASH_STATICCAST
|
||||
# undef ANKERL_UNORDERED_DENSE_LIKELY
|
||||
# undef ANKERL_UNORDERED_DENSE_UNLIKELY
|
||||
|
||||
# if defined(__GNUC__) && !defined(__clang__)
|
||||
# pragma GCC diagnostic pop
|
||||
# endif
|
||||
|
||||
} // namespace wyhash
|
@@ -70,6 +70,8 @@ TEST_NAME=local_read POOLCFG='"local_reads":"random",' ./test_heal.sh
|
||||
SCHEME=ec ./test_heal.sh
|
||||
ANTIETCD=1 ./test_heal.sh
|
||||
|
||||
./test_reweight_half.sh
|
||||
|
||||
TEST_NAME=csum_32k_dmj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
|
||||
TEST_NAME=csum_32k_dj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
|
||||
TEST_NAME=csum_32k OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
|
||||
|
@@ -25,26 +25,33 @@ done
|
||||
|
||||
for i in $(seq 1 $OSD_COUNT); do
|
||||
offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
|
||||
opts=$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin)
|
||||
meta_offset=$(echo $offsets | jq -r .meta_offset)
|
||||
data_offset=$(echo $offsets | jq -r .data_offset)
|
||||
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
|
||||
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
|
||||
#build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
|
||||
#build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
|
||||
build/src/disk_tool/vitastor-disk dump-meta --io cached $opts >./testdata/meta_before_resize.json
|
||||
new_data_offset=$((128*1024*1024+data_offset%131072))
|
||||
build/src/disk_tool/vitastor-disk raw-resize --io cached \
|
||||
$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
|
||||
$opts \
|
||||
--new_meta_offset 0 \
|
||||
--new_meta_len $((1024*1024)) \
|
||||
--new_journal_offset $((1024*1024)) \
|
||||
--new_data_offset $((128*1024*1024+32768))
|
||||
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
|
||||
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
|
||||
--new_data_offset $new_data_offset
|
||||
#build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
|
||||
build/src/disk_tool/vitastor-disk dump-meta --io cached $opts \
|
||||
--meta_offset 0 \
|
||||
--meta_len $((1024*1024)) \
|
||||
--journal_offset $((1024*1024)) \
|
||||
--data_offset $new_data_offset >./testdata/meta_after_resize.json
|
||||
if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
|
||||
jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
|
||||
jq -e -s 'map([ .entries[] | del(.block, .writes[].location) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
|
||||
format_error "OSD $i metadata corrupted after resizing"
|
||||
fi
|
||||
if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
|
||||
jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
|
||||
format_error "OSD $i journal corrupted after resizing"
|
||||
fi
|
||||
#if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
|
||||
# jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
|
||||
# format_error "OSD $i journal corrupted after resizing"
|
||||
#fi
|
||||
done
|
||||
|
||||
$ETCDCTL del --prefix /vitastor/osd/state/
|
||||
@@ -54,7 +61,7 @@ for i in $(seq 1 $OSD_COUNT); do
|
||||
--data_device ./testdata/bin/test_osd$i.bin \
|
||||
--meta_offset 0 \
|
||||
--journal_offset $((1024*1024)) \
|
||||
--data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 &
|
||||
--data_offset $new_data_offset >>./testdata/osd$i.log 2>&1 &
|
||||
eval OSD${i}_PID=$!
|
||||
done
|
||||
|
||||
|
@@ -15,7 +15,7 @@ trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT
|
||||
# also test prepare --hybrid :)
|
||||
# non-vitastor random type UUID to prevent udev activation
|
||||
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
|
||||
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
|
||||
sudo build/src/disk_tool/vitastor-disk-test prepare --meta_format 2 --no_init 1 --meta_reserve 1x,1M \
|
||||
--block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
|
||||
--hybrid --fast-devices $LOOP2 $LOOP1
|
||||
|
||||
|
41
tests/test_reweight_half.sh
Executable file
41
tests/test_reweight_half.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash -ex
|
||||
|
||||
. `dirname $0`/common.sh
|
||||
|
||||
node mon/mon-main.js $MON_PARAMS --etcd_address $ETCD_URL --etcd_prefix "/vitastor" >>./testdata/mon.log 2>&1 &
|
||||
MON_PID=$!
|
||||
wait_etcd
|
||||
|
||||
TIME=$(date '+%s')
|
||||
$ETCDCTL put /vitastor/osd/stats/1 '{"host":"host1","size":1073741824,"time":"'$TIME'"}'
|
||||
$ETCDCTL put /vitastor/osd/stats/2 '{"host":"host1","size":1073741824,"time":"'$TIME'"}'
|
||||
$ETCDCTL put /vitastor/osd/stats/3 '{"host":"host2","size":1073741824,"time":"'$TIME'"}'
|
||||
$ETCDCTL put /vitastor/osd/stats/4 '{"host":"host2","size":1073741824,"time":"'$TIME'"}'
|
||||
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create-pool testpool -s 2 -n 16 --force
|
||||
|
||||
sleep 2
|
||||
|
||||
# check that all OSDs have 8 PGs
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 1 or .[1] == 1) ] | length) == 8'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 2 or .[1] == 2) ] | length) == 8'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 3 or .[1] == 3) ] | length) == 8'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 4 or .[1] == 4) ] | length) == 8'
|
||||
|
||||
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL modify-osd --reweight 0.5 3
|
||||
|
||||
sleep 2
|
||||
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 1 or .[1] == 1) ] | length) == 8'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 2 or .[1] == 2) ] | length) == 8'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 3 or .[1] == 3) ] | length) <= 6'
|
||||
$ETCDCTL get /vitastor/pg/config --print-value-only | \
|
||||
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 4 or .[1] == 4) ] | length) >= 10'
|
||||
|
||||
format_green OK
|
@@ -7,6 +7,8 @@ if [[ ("$SCHEME" = "" || "$SCHEME" = "replicated") && ("$PG_SIZE" = "" || "$PG_S
|
||||
OSD_COUNT=2
|
||||
fi
|
||||
|
||||
OSD_ARGS="--scrub_list_limit 1000 $OSD_ARGS"
|
||||
|
||||
. `dirname $0`/run_3osds.sh
|
||||
|
||||
check_qemu
|
||||
|
Reference in New Issue
Block a user