Compare commits

..

8 Commits

Author SHA1 Message Date
81fc8bb94c Release 0.8.4
New features:
- Implement QCOW2 image/snapshot export via qemu-img (bdrv_co_block_status in the driver)
- Remove OSDs from PG history during `vitastor-cli rm-osd` to prevent `left_on_dead` PG states after deletion
- Add a new recovery_pg_switch setting to mix all PGs during recovery, to almost
  fully reduce the probability of ENOSPC during rebalance
- Introduce partial ENOSPC ("OSD is full") handling - now ENOSPC doesn't turn
  into cascades of crashes
- Add migration support to Proxmox VE Vitastor driver
- Track last_clean_pgs on a per-pool basis thus reducing data movement in a cluster
  with pools remaining unclean/degraded for a long time

Bug fixes:
- Fix a bug where monitor could generate degraded PGs if one of the hosts had no OSDs
- Fix a bug where monitor could skip PG redistribution with a lot of OSDs in cluster
- Report PG history synchronously on the first write, which improves PG consistency
  and availability at the same time, because history now gets reported correctly
  and doesn't get reported without the need for it
- Fix possible write and recovery stalls which could happen in a cluster with both EC and replicated pools
- Make OSD and monitors sanitize & deduplicate PG history items in etcd
- Fix non-working OSD peer config safety check
- Fix a rare journal flush stall where flushing wasn't activated with full journal, but with empty flush queue
- Fix builds without ISA-L (jerasure-only) crashing with EC N+K, K>=2 due to the lack of 16-byte buffer alignment
- Fix a possible crash for EC N+K, K>=2 when calculating a parity chunk with previous parity chunk missing
- Fix a bug where vitastor-disk purge with suppressed warnings didn't work
2023-01-13 23:59:54 +03:00
bc465c16de Fix arithmetic on void* for clang 2023-01-13 23:58:42 +03:00
8763e9211c Fix qemu driver compilation warning/error 2023-01-13 23:44:39 +03:00
9e1a80bd17 Replace apt-key with trusted.gpg.d 2023-01-13 19:51:47 +03:00
3e280f2f08 Mark vitastor as shared storage in PVE driver 2023-01-13 01:36:30 +03:00
fe87b4076b Fix backwards compatibility in cluster_client 2023-01-12 02:37:31 +03:00
a38957c1a7 Skip empty hosts in lp-optimizer 2023-01-09 16:26:16 +03:00
137309cf29 Implement bdrv_co_block_status for snapshot export support 2023-01-07 17:06:58 +03:00
33 changed files with 411 additions and 361 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.3")
set(VERSION "0.8.4")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.3
VERSION ?= v0.8.4
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.3
image: vitalif/vitastor-csi:v0.8.4
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.3
image: vitalif/vitastor-csi:v0.8.4
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.3"
vitastorCSIDriverVersion = "0.8.4"
)
// Config struct fills the parameters of request or user input

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.3-1) unstable; urgency=medium
vitastor (0.8.4-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.3-1) unstable; urgency=medium
vitastor (0.8.4-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.3; \
cd vitastor-0.8.3; \
cp -r /root/vitastor vitastor-0.8.4; \
cd vitastor-0.8.4; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
cd vitastor-0.8.3; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.4.orig.tar.xz vitastor-0.8.4; \
cd vitastor-0.8.4; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -9,7 +9,7 @@
## Debian
- Trust Vitastor package signing key:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`

View File

@@ -9,7 +9,7 @@
## Debian
- Добавьте ключ репозитория Vitastor:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`

View File

@@ -14,6 +14,7 @@ It supports the following commands:
- [df](#df)
- [ls](#ls)
- [create](#create)
- [snap-create](#create)
- [modify](#modify)
- [rm](#rm)
- [flatten](#flatten)
@@ -123,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

View File

@@ -15,6 +15,7 @@ vitastor-cli - интерфейс командной строки для адм
- [df](#df)
- [ls](#ls)
- [create](#create)
- [snap-create](#create)
- [modify](#modify)
- [rm](#rm)
- [flatten](#flatten)
@@ -126,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
клиентов, если пишущий клиент максимум 1.
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

View File

@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
if you don't want to use inode metadata.
### Exporting snapshots
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
the snapshot separately using the following commands (key points are using `skip-parents=1` and
`-B backing_file` option):
```
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-O qcow2 testimg_0.qcow2
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
```
In fact, with `cluster_size=4k` any QCOW2 file can be used instead `-B testimg_0.qcow2`, even an empty one.
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
its parent, run:
```
qemu-img rebase -u -b '' testimg.qcow2
```
This can be used for backups. Just note that exporting an image that is currently being written to
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.

View File

@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
### Экспорт снимков
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
```
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-O qcow2 testimg_0.qcow2
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
```
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
даже пустой.
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
`testimg.qcow2` от базового, выполните:
```
qemu-img rebase -u -b '' testimg.qcow2
```
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.

View File

@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
seed ^= seed << 5;
return seed + 2147483648;
};
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
const r = {};
// Generate random combinations including each OSD at least once
for (let h = 0; h < hosts.length; h++)

View File

@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);
use base qw(PVE::Storage::Plugin);
if (@PVE::Storage::Plugin::SHARED_STORAGE)
{
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
}
sub api
{
# Trick it :)
@@ -133,9 +138,10 @@ sub properties
sub options
{
return {
shared => { optional => 1 },
nodes => { optional => 1 },
disable => { optional => 1 },
vitastor_etcd_address => { optional => 1},
vitastor_etcd_address => { optional => 1 },
vitastor_etcd_prefix => { optional => 1 },
vitastor_config_path => { optional => 1 },
vitastor_prefix => { optional => 1 },

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.3'
VERSION = '0.8.4'
LOG = logging.getLogger(__name__)

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.4$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.4.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.3
Version: 0.8.4
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.3.el7.tar.gz
Source0: vitastor-0.8.4.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.4.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.3
Version: 0.8.4
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.3.el8.tar.gz
Source0: vitastor-0.8.4.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.3")
add_definitions(-DVERSION="0.8.4")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -403,7 +403,7 @@ struct snap_merger_t
op->opcode = OSD_OP_READ_BITMAP;
op->inode = target;
op->offset = offset;
op->len = 0;
op->len = target_block_size;
op->callback = [this](cluster_op_t *op)
{
if (op->retval < 0)

View File

@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
if (!op->prev_wait)
continue_sync(op);
}
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
{
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
{
@@ -151,7 +151,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
{
op->prev_wait++;
}
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
break;
@@ -171,7 +172,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
@@ -337,7 +339,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
// And now they have to be resliced!
for (auto op = op_queue_head; op; op = op->next)
{
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
INODE_POOL(op->cur_inode) == pool_item.first)
{
op->needs_reslice = true;
@@ -409,7 +412,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
void cluster_client_t::execute(cluster_op_t *op)
{
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
@@ -441,7 +444,7 @@ void cluster_client_t::execute(cluster_op_t *op)
return;
}
// Check alignment
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
{
op->retval = -EINVAL;
@@ -702,8 +705,7 @@ resume_3:
// Finished successfully
// Even if the PG count has changed in meanwhile we treat it as success
// because if some operations were invalid for the new PG count we'd get errors
bool is_read = op->opcode == OSD_OP_READ;
if (is_read)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Check parent inode
auto ino_it = st_cli.inode_config.find(op->cur_inode);
@@ -727,6 +729,11 @@ resume_3:
}
}
op->retval = op->len;
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
op->retval = op->len / pool_cfg.bitmap_granularity;
}
erase_op(op);
return 1;
}
@@ -809,23 +816,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
op->retval = 0;
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Allocate memory for the bitmap
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
if (op->bitmap_buf_size < bitmap_mem)
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
{
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
if (!op->bitmap_buf_size)
{
// First allocation
memset(op->bitmap_buf, 0, object_bitmap_size);
}
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
op->bitmap_buf_size = bitmap_mem;
}
memset(op->bitmap_buf, 0, bitmap_mem);
}
int iov_idx = 0;
size_t iov_pos = 0;
@@ -876,13 +879,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
if (end == begin)
op->done_count++;
}
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
{
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
}
op->parts[i].parent = op;
op->parts[i].offset = begin;
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
op->parts[i].pg_num = pg_num;
op->parts[i].osd_num = 0;
op->parts[i].flags = 0;
@@ -929,7 +933,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
);
uint64_t meta_rev = 0;
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
{
auto ino_it = st_cli.inode_config.find(op->inode);
if (ino_it != st_cli.inode_config.end())
@@ -942,7 +946,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = next_op_id(),
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
},
.inode = op->cur_inode,
.offset = part->offset,
@@ -950,8 +954,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.meta_revision = meta_rev,
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
} },
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
? pg_bitmap_size : 0),
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
@@ -1130,11 +1136,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
else
{
// OK
if (!(op->flags & OP_IMMEDIATE_COMMIT))
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
dirty_osds.insert(part->osd_num);
part->flags |= PART_DONE;
op->done_count++;
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
copy_part_bitmap(op, part);
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
@@ -1158,7 +1164,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
);
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
if (part_len > op_len-object_offset)
{
part_len = op_len-object_offset;
}
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
{
// Copy bytes

View File

@@ -11,6 +11,7 @@
#define INODE_LIST_DONE 1
#define INODE_LIST_HAS_UNSTABLE 2
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
#define OSD_OP_READ_CHAIN_BITMAP 0x102
#define OSD_OP_IGNORE_READONLY 0x08
@@ -30,7 +31,7 @@ struct cluster_op_part_t
struct cluster_op_t
{
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
uint64_t inode;
uint64_t offset;
uint64_t len;
@@ -39,9 +40,13 @@ struct cluster_op_t
uint64_t version = 0;
// now only OSD_OP_IGNORE_READONLY is supported
uint64_t flags = 0;
// negative retval is an error number
// write and read return len on success
// sync and delete return 0 on success
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
int retval;
osd_op_buf_list_t iov;
// READ and READ_BITMAP return the bitmap here
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
void *bitmap_buf = NULL;
std::function<void(cluster_op_t*)> callback;
~cluster_op_t();

View File

@@ -1,2 +0,0 @@
mat: mat.c
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure

View File

@@ -1,291 +0,0 @@
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns
// w should be >= log2(data_drives + groups*local + global), but not necessary 8/16/32
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
if (w < 0 || w > 32 || data_drives + groups*local + global > (1<<w))
{
return NULL;
}
int *lrc_matrix = (int*)malloc(sizeof(int) * (local*groups+global));
int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
// Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
//for (int i = 0; i < local+global; i++)
//{
// int t = matrix[i*data_drives + 3];
// matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
// matrix[i*data_drives + 7] = t;
//}
for (int gr = 0; gr < groups; gr++)
{
for (int l = 0; l < local; l++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(gr*local+l)*data_drives + j] = (j / (data_drives/groups)) == gr ? matrix[l*data_drives + j] : 0;
}
}
}
for (int i = 0; i < global; i++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
}
}
free(matrix);
return lrc_matrix;
}
struct lrc_test_result_t
{
int success, impossible, failures;
};
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
{
int n = data_drives;
int total_rows = n + groups*local + global;
int impossible = 0, success = 0, failures = 0;
int *lost_per_group = (int*)malloc(sizeof(int) * groups);
int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
// global+1 is always recoverable
for (int lost = global+2; lost <= groups*local+global; lost++)
{
int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
for (int i = 0; i < n; i++)
p[i] = i;
int *p2 = (int*)malloc(sizeof(int) * n);
if (total_rows-lost > n)
{
p[n-1] = n; // skip combinations with all N data disks (0..n-1)
for (int i = n; i < total_rows-lost; i++)
p[i] = i+1;
p[total_rows-lost-1]--; // will be incremented on the first step
}
int inc = total_rows-lost-1;
while (1)
{
p[inc]++;
if (p[inc] >= n+groups*local+global)
{
if (inc == 0)
break;
inc--;
}
else if (inc+1 < total_rows-lost)
{
p[inc+1] = p[inc];
inc++;
}
else
{
// Check if it should be recoverable
// Calculate count of data chunks lost in each group
int nsel = 0;
for (int gr = 0; gr < groups; gr++)
{
lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
recovered_per_group[gr] = 0;
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] < n)
{
lost_per_group[(p[j] / (n/groups))]--;
selected_inverted[nsel++] = j;
}
}
// Every local parity chunk is supposed to restore 1 missing chunk inside its group
// So, subtract local parity chunk counts from each group lost chunk count
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n && p[j] < n+groups*local)
{
int gr = (p[j]-n)/local;
if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
{
selected_inverted[nsel++] = j;
}
recovered_per_group[gr]++;
}
}
// Every global parity chunk is supposed to restore 1 chunk of all that are still missing
int still_missing = 0;
for (int gr = 0; gr < groups; gr++)
{
int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
still_missing += (non_fixed > 0 ? non_fixed : 0);
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n+groups*local)
{
if (still_missing > 0 && nsel < n)
{
selected_inverted[nsel++] = j;
}
still_missing--;
}
}
if (still_missing <= 0)
{
// We hope it can be recoverable. Try to invert it
assert(nsel == n);
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
}
}
int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
if (invert_ok < 0)
{
failures++;
if (log_level > 0)
{
printf("\nFAIL: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\nDIRECT:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", lrc_matrix[p[i]*n+j]);
printf("\n");
}
printf("INVERSE:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", inverted_matrix[i*n+j]);
printf("\n");
}
}
}
else
{
success++;
if (log_level > 2)
{
printf("OK: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
else
{
impossible++;
if (log_level > 1)
{
printf("IMPOSSIBLE: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
}
free(p2);
free(p);
free(inverted_matrix);
free(erased_matrix);
}
free(lost_per_group);
free(recovered_per_group);
return (struct lrc_test_result_t){
.success = success,
.impossible = impossible,
.failures = failures,
};
}
int main()
{
int W = 8, MATRIX_W = 8;
int n = 8, groups = 2, local = 1, global = 2;
//n = 4, groups = 2, local = 1, global = 1;
int total_rows = n+groups*local+global;
int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
// Fill identity+LRC matrix
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
lrc_matrix[i*n + j] = j == i ? 1 : 0;
memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
free(matrix);
matrix = NULL;
// Print LRC matrix
for (int i = 0; i < total_rows; i++)
{
for (int j = 0; j < n; j++)
{
printf("%d ", lrc_matrix[i*n+j]);
}
printf("\n");
}
struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
return 0;
}
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
//
// Can't recover
// 1 2 4 5 8 9 10 11 -1
// 2 3 4 6 8 9 10 11 -1
// FULL:
// 1 0 0 0 0 0 0 0
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 0 0 0 0 0 0 1 0
// 0 0 0 0 0 0 0 1
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// FIRST UNRECOVERABLE:
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// SECOND UNRECOVERABLE:
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 1 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// Ho ho ho

View File

@@ -945,7 +945,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
{
if (write_osd_set[i])
{
memcpy(subm + item_size*pg_minsize*j, matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
j++;
}
}

View File

@@ -53,6 +53,7 @@ typedef struct VitastorClient
char *etcd_host;
char *etcd_prefix;
char *image;
int skip_parents;
uint64_t inode;
uint64_t pool;
uint64_t size;
@@ -63,6 +64,10 @@ typedef struct VitastorClient
int rdma_gid_index;
int rdma_mtu;
QemuMutex mutex;
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
uint32_t last_bitmap_granularity;
uint8_t *last_bitmap;
} VitastorClient;
typedef struct VitastorRPC
@@ -72,6 +77,9 @@ typedef struct VitastorRPC
QEMUIOVector *iov;
long ret;
int complete;
uint64_t inode, offset, len;
uint32_t bitmap_granularity;
uint8_t *bitmap;
} VitastorRPC;
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
@@ -147,6 +155,7 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
if (!strcmp(name, "inode") ||
!strcmp(name, "pool") ||
!strcmp(name, "size") ||
!strcmp(name, "skip-parents") ||
!strcmp(name, "use-rdma") ||
!strcmp(name, "rdma-port_num") ||
!strcmp(name, "rdma-gid-index") ||
@@ -227,13 +236,16 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
{
VitastorRPC task;
VitastorClient *client = bs->opaque;
void *image = NULL;
int64_t ret = 0;
qemu_mutex_init(&client->mutex);
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
// FIXME: Rename to etcd_address
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
@@ -243,23 +255,25 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
client->image = g_strdup(qdict_get_try_str(options, "image"));
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
// Get image metadata (size and readonly flag) or just wait until the client is ready
if (!image)
client->image = (char*)"x";
task.complete = 0;
task.bs = bs;
if (qemu_in_coroutine())
{
vitastor_co_get_metadata(&task);
}
else
{
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
BDRV_POLL_WHILE(bs, !task.complete);
}
client->image = image;
if (client->image)
{
// Get image metadata (size and readonly flag)
VitastorRPC task;
task.complete = 0;
task.bs = bs;
if (qemu_in_coroutine())
{
vitastor_co_get_metadata(&task);
}
else
{
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
BDRV_POLL_WHILE(bs, !task.complete);
}
client->watch = (void*)task.ret;
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
client->size = vitastor_c_inode_get_size(client->watch);
@@ -284,6 +298,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
}
client->size = qdict_get_try_int(options, "size", 0);
vitastor_c_close_watch(client->proxy, (void*)task.ret);
}
if (!client->size)
{
@@ -305,6 +320,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
qdict_del(options, "inode");
qdict_del(options, "pool");
qdict_del(options, "size");
qdict_del(options, "skip-parents");
return ret;
}
@@ -321,6 +337,8 @@ static void vitastor_close(BlockDriverState *bs)
g_free(client->etcd_prefix);
if (client->image)
g_free(client->image);
free(client->last_bitmap);
client->last_bitmap = NULL;
}
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
@@ -486,6 +504,13 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
vitastor_co_init_task(bs, &task);
task.iov = iov;
if (client->last_bitmap)
{
// Invalidate last bitmap on write
free(client->last_bitmap);
client->last_bitmap = NULL;
}
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
qemu_mutex_lock(&client->mutex);
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
@@ -499,6 +524,140 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
return task.ret;
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
{
VitastorRPC *task = opaque;
VitastorClient *client = task->bs->opaque;
task->ret = retval;
task->complete = 1;
if (retval >= 0)
{
task->bitmap = bitmap;
if (client->last_bitmap_inode == task->inode &&
client->last_bitmap_offset == task->offset &&
client->last_bitmap_len == task->len)
{
free(client->last_bitmap);
client->last_bitmap = bitmap;
}
}
if (qemu_coroutine_self() != task->co)
{
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
aio_co_wake(task->co);
#else
qemu_coroutine_enter(task->co, NULL);
qemu_aio_release(task);
#endif
}
}
static int coroutine_fn vitastor_co_block_status(
BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map, BlockDriverState **file)
{
// Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
// Not allocated => return 0
// Error => return -errno
// Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
VitastorRPC task;
VitastorClient *client = bs->opaque;
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
uint8_t bit = 0;
if (client->last_bitmap && client->last_bitmap_inode == inode &&
client->last_bitmap_offset <= offset &&
client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
{
// Use the previously read bitmap
task.bitmap_granularity = client->last_bitmap_granularity;
task.offset = client->last_bitmap_offset;
task.len = client->last_bitmap_len;
task.bitmap = client->last_bitmap;
}
else
{
// Read bitmap from this position, rounding to full inode PG blocks
uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
if (!block_size)
return -EAGAIN;
// Init coroutine
vitastor_co_init_task(bs, &task);
free(client->last_bitmap);
task.inode = client->last_bitmap_inode = inode;
task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
task.offset = client->last_bitmap_offset = offset / block_size * block_size;
task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
task.bitmap = client->last_bitmap = NULL;
qemu_mutex_lock(&client->mutex);
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
{
qemu_coroutine_yield();
}
if (task.ret < 0)
{
// Error
return task.ret;
}
}
if (want_zero)
{
// Get precise mapping with all holes
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
uint64_t bmp_len = task.len / task.bitmap_granularity;
uint64_t bmp_end = bmp_pos+1;
bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
{
bmp_end++;
}
*pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
}
else
{
// Get larger allocated extents, possibly with false positives
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos;
while (bmp_pos < bmp_end)
{
if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
{
bit = bit || task.bitmap[bmp_pos >> 3];
bmp_pos += 8;
}
else
{
bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
bmp_pos++;
}
}
*pnum = bytes;
}
if (bit)
{
*map = offset;
*file = bs;
}
return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
}
#endif
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
// QEMU 1.7-2.11
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
int64_t map = 0;
int64_t pnumbytes = 0;
int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, &file);
*pnum = pnumbytes/BDRV_SECTOR_SIZE;
return r;
}
#endif
#endif
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
{
@@ -606,6 +765,15 @@ static BlockDriver bdrv_vitastor = {
.bdrv_co_truncate = vitastor_co_truncate,
#endif
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
// For snapshot export
.bdrv_co_block_status = vitastor_co_block_status,
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
.bdrv_co_get_block_status = vitastor_co_get_block_status,
#endif
#endif
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
.bdrv_co_preadv = vitastor_co_preadv,
.bdrv_co_pwritev = vitastor_co_pwritev,

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.3
Version: 0.8.4
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -207,6 +207,28 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
client->cli->execute(op);
}
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
int with_parents, VitastorReadBitmapHandler cb, void *opaque)
{
cluster_op_t *op = new cluster_op_t;
op->opcode = with_parents ? OSD_OP_READ_CHAIN_BITMAP : OSD_OP_READ_BITMAP;
op->inode = inode;
op->offset = offset;
op->len = len;
op->callback = [cb, opaque](cluster_op_t *op)
{
uint8_t *bitmap = NULL;
if (op->retval >= 0)
{
bitmap = (uint8_t*)op->bitmap_buf;
op->bitmap_buf = NULL;
}
cb(opaque, op->retval, bitmap);
delete op;
};
client->cli->execute(op);
}
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
{
cluster_op_t *op = new cluster_op_t;
@@ -245,6 +267,25 @@ uint64_t vitastor_c_inode_get_num(void *handle)
return watch->cfg.num;
}
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
{
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
if (pool_it == client->cli->st_cli.pool_config.end())
return 0;
auto & pool_cfg = pool_it->second;
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
return pool_cfg.data_block_size * pg_data_size;
}
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
{
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
if (pool_it == client->cli->st_cli.pool_config.end())
return 0;
// FIXME: READ_BITMAP may fails if parent bitmap granularity differs from inode bitmap granularity
return pool_it->second.bitmap_granularity;
}
int vitastor_c_inode_get_readonly(void *handle)
{
inode_watch_t *watch = (inode_watch_t*)handle;

View File

@@ -6,6 +6,9 @@
#ifndef VITASTOR_QEMU_PROXY_H
#define VITASTOR_QEMU_PROXY_H
// C API wrapper version
#define VITASTOR_C_API_VERSION 1
#ifndef POOL_ID_BITS
#define POOL_ID_BITS 16
#endif
@@ -21,6 +24,7 @@ typedef struct vitastor_c vitastor_c;
typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
typedef void VitastorIOHandler(void *opaque, long retval);
typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);
// QEMU
typedef void IOHandler(void *opaque);
@@ -42,11 +46,15 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
int with_parents, VitastorReadBitmapHandler cb, void *opaque);
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
void vitastor_c_close_watch(vitastor_c *client, void *handle);
uint64_t vitastor_c_inode_get_size(void *handle);
uint64_t vitastor_c_inode_get_num(void *handle);
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
int vitastor_c_inode_get_readonly(void *handle);
#ifdef __cplusplus

View File

@@ -22,6 +22,16 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
qemu-img convert -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
-O qcow2 ./testdata/layer0.qcow2
qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
qemu-img convert -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
-O raw ./testdata/merged.bin
@@ -52,4 +62,18 @@ qemu-img convert -S 4096 -p \
cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
# Test merge by qemu-img
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
cmp ./testdata/merged.bin ./testdata/rebased.bin
qemu-img rebase -u -b '' ./testdata/layer1.qcow2
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
cmp ./testdata/layer1.bin ./testdata/rebased.bin
format_green OK