Compare commits
8 Commits
lrc-matrix
...
v0.8.4
Author | SHA1 | Date | |
---|---|---|---|
81fc8bb94c | |||
bc465c16de | |||
8763e9211c | |||
9e1a80bd17 | |||
3e280f2f08 | |||
fe87b4076b | |||
a38957c1a7 | |||
137309cf29 |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.8.3")
|
||||
set(VERSION "0.8.4")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.8.3
|
||||
VERSION ?= v0.8.4
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.8.3
|
||||
image: vitalif/vitastor-csi:v0.8.4
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.8.3
|
||||
image: vitalif/vitastor-csi:v0.8.4
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.8.3"
|
||||
vitastorCSIDriverVersion = "0.8.4"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.8.3-1) unstable; urgency=medium
|
||||
vitastor (0.8.4-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.8.3-1) unstable; urgency=medium
|
||||
vitastor (0.8.4-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.8.3; \
|
||||
cd vitastor-0.8.3; \
|
||||
cp -r /root/vitastor vitastor-0.8.4; \
|
||||
cd vitastor-0.8.4; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
|
||||
cd vitastor-0.8.3; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.4.orig.tar.xz vitastor-0.8.4; \
|
||||
cd vitastor-0.8.4; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Trust Vitastor package signing key:
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Добавьте ключ репозитория Vitastor:
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
|
@@ -14,6 +14,7 @@ It supports the following commands:
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
@@ -123,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
||||
|
||||
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
|
@@ -15,6 +15,7 @@ vitastor-cli - интерфейс командной строки для адм
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
@@ -126,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
||||
клиентов, если пишущий клиент максимум 1.
|
||||
|
||||
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
|
@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||
if you don't want to use inode metadata.
|
||||
|
||||
### Exporting snapshots
|
||||
|
||||
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
|
||||
|
||||
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
|
||||
the snapshot separately using the following commands (key points are using `skip-parents=1` and
|
||||
`-B backing_file` option):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
In fact, with `cluster_size=4k` any QCOW2 file can be used instead `-B testimg_0.qcow2`, even an empty one.
|
||||
|
||||
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
|
||||
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
|
||||
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
|
||||
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
|
||||
in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
|
||||
|
||||
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
|
||||
its parent, run:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
This can be used for backups. Just note that exporting an image that is currently being written to
|
||||
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
||||
on a live VM.
|
||||
|
@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||
|
||||
### Экспорт снимков
|
||||
|
||||
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
|
||||
|
||||
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
|
||||
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
|
||||
даже пустой.
|
||||
|
||||
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
|
||||
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
|
||||
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
|
||||
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
|
||||
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
|
||||
|
||||
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
|
||||
`testimg.qcow2` от базового, выполните:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
||||
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
||||
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
||||
|
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
||||
const r = {};
|
||||
// Generate random combinations including each OSD at least once
|
||||
for (let h = 0; h < hosts.length; h++)
|
||||
|
@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);
|
||||
|
||||
use base qw(PVE::Storage::Plugin);
|
||||
|
||||
if (@PVE::Storage::Plugin::SHARED_STORAGE)
|
||||
{
|
||||
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
|
||||
}
|
||||
|
||||
sub api
|
||||
{
|
||||
# Trick it :)
|
||||
@@ -133,9 +138,10 @@ sub properties
|
||||
sub options
|
||||
{
|
||||
return {
|
||||
shared => { optional => 1 },
|
||||
nodes => { optional => 1 },
|
||||
disable => { optional => 1 },
|
||||
vitastor_etcd_address => { optional => 1},
|
||||
vitastor_etcd_address => { optional => 1 },
|
||||
vitastor_etcd_prefix => { optional => 1 },
|
||||
vitastor_config_path => { optional => 1 },
|
||||
vitastor_prefix => { optional => 1 },
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.8.3'
|
||||
VERSION = '0.8.4'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -25,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.8.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.4$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.4.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.3
|
||||
Version: 0.8.4
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.3.el7.tar.gz
|
||||
Source0: vitastor-0.8.4.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.4.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.3
|
||||
Version: 0.8.4
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.3.el8.tar.gz
|
||||
Source0: vitastor-0.8.4.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.8.3")
|
||||
add_definitions(-DVERSION="0.8.4")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
|
@@ -403,7 +403,7 @@ struct snap_merger_t
|
||||
op->opcode = OSD_OP_READ_BITMAP;
|
||||
op->inode = target;
|
||||
op->offset = offset;
|
||||
op->len = 0;
|
||||
op->len = target_block_size;
|
||||
op->callback = [this](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval < 0)
|
||||
|
@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
if (!op->prev_wait)
|
||||
continue_sync(op);
|
||||
}
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
|
||||
{
|
||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||
{
|
||||
@@ -151,7 +151,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait++;
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
|
||||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||
break;
|
||||
@@ -171,7 +172,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
|
||||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
@@ -337,7 +339,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
// And now they have to be resliced!
|
||||
for (auto op = op_queue_head; op; op = op->next)
|
||||
{
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
|
||||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
@@ -409,7 +412,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
||||
void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
@@ -441,7 +444,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
return;
|
||||
}
|
||||
// Check alignment
|
||||
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
||||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
@@ -702,8 +705,7 @@ resume_3:
|
||||
// Finished successfully
|
||||
// Even if the PG count has changed in meanwhile we treat it as success
|
||||
// because if some operations were invalid for the new PG count we'd get errors
|
||||
bool is_read = op->opcode == OSD_OP_READ;
|
||||
if (is_read)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Check parent inode
|
||||
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||
@@ -727,6 +729,11 @@ resume_3:
|
||||
}
|
||||
}
|
||||
op->retval = op->len;
|
||||
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||
op->retval = op->len / pool_cfg.bitmap_granularity;
|
||||
}
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
@@ -809,23 +816,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
op->retval = 0;
|
||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Allocate memory for the bitmap
|
||||
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
||||
if (op->bitmap_buf_size < bitmap_mem)
|
||||
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
|
||||
{
|
||||
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||
if (!op->bitmap_buf_size)
|
||||
{
|
||||
// First allocation
|
||||
memset(op->bitmap_buf, 0, object_bitmap_size);
|
||||
}
|
||||
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
||||
op->bitmap_buf_size = bitmap_mem;
|
||||
}
|
||||
memset(op->bitmap_buf, 0, bitmap_mem);
|
||||
}
|
||||
int iov_idx = 0;
|
||||
size_t iov_pos = 0;
|
||||
@@ -876,13 +879,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
if (end == begin)
|
||||
op->done_count++;
|
||||
}
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||
}
|
||||
op->parts[i].parent = op;
|
||||
op->parts[i].offset = begin;
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
|
||||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].pg_num = pg_num;
|
||||
op->parts[i].osd_num = 0;
|
||||
op->parts[i].flags = 0;
|
||||
@@ -929,7 +933,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||
);
|
||||
uint64_t meta_rev = 0;
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end())
|
||||
@@ -942,7 +946,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = next_op_id(),
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
},
|
||||
.inode = op->cur_inode,
|
||||
.offset = part->offset,
|
||||
@@ -950,8 +954,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.meta_revision = meta_rev,
|
||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||
} },
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? pg_bitmap_size : 0),
|
||||
.callback = [this, part](osd_op_t *op_part)
|
||||
{
|
||||
handle_op_part(part);
|
||||
@@ -1130,11 +1136,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
else
|
||||
{
|
||||
// OK
|
||||
if (!(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
op->done_count++;
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
@@ -1158,7 +1164,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
);
|
||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
|
||||
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
|
||||
if (part_len > op_len-object_offset)
|
||||
{
|
||||
part_len = op_len-object_offset;
|
||||
}
|
||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||
{
|
||||
// Copy bytes
|
||||
|
@@ -11,6 +11,7 @@
|
||||
#define INODE_LIST_DONE 1
|
||||
#define INODE_LIST_HAS_UNSTABLE 2
|
||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||
#define OSD_OP_READ_CHAIN_BITMAP 0x102
|
||||
|
||||
#define OSD_OP_IGNORE_READONLY 0x08
|
||||
|
||||
@@ -30,7 +31,7 @@ struct cluster_op_part_t
|
||||
|
||||
struct cluster_op_t
|
||||
{
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
|
||||
uint64_t inode;
|
||||
uint64_t offset;
|
||||
uint64_t len;
|
||||
@@ -39,9 +40,13 @@ struct cluster_op_t
|
||||
uint64_t version = 0;
|
||||
// now only OSD_OP_IGNORE_READONLY is supported
|
||||
uint64_t flags = 0;
|
||||
// negative retval is an error number
|
||||
// write and read return len on success
|
||||
// sync and delete return 0 on success
|
||||
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
|
||||
int retval;
|
||||
osd_op_buf_list_t iov;
|
||||
// READ and READ_BITMAP return the bitmap here
|
||||
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
|
||||
void *bitmap_buf = NULL;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
~cluster_op_t();
|
||||
|
@@ -1,2 +0,0 @@
|
||||
mat: mat.c
|
||||
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure
|
291
src/lrc/mat.c
291
src/lrc/mat.c
@@ -1,291 +0,0 @@
|
||||
#include <jerasure/reed_sol.h>
|
||||
#include <jerasure.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns
|
||||
// w should be >= log2(data_drives + groups*local + global), but not necessary 8/16/32
|
||||
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
|
||||
{
|
||||
if (w < 0 || w > 32 || data_drives + groups*local + global > (1<<w))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
int *lrc_matrix = (int*)malloc(sizeof(int) * (local*groups+global));
|
||||
int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
|
||||
// Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
|
||||
//for (int i = 0; i < local+global; i++)
|
||||
//{
|
||||
// int t = matrix[i*data_drives + 3];
|
||||
// matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
|
||||
// matrix[i*data_drives + 7] = t;
|
||||
//}
|
||||
for (int gr = 0; gr < groups; gr++)
|
||||
{
|
||||
for (int l = 0; l < local; l++)
|
||||
{
|
||||
for (int j = 0; j < data_drives; j++)
|
||||
{
|
||||
lrc_matrix[(gr*local+l)*data_drives + j] = (j / (data_drives/groups)) == gr ? matrix[l*data_drives + j] : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < global; i++)
|
||||
{
|
||||
for (int j = 0; j < data_drives; j++)
|
||||
{
|
||||
lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
|
||||
}
|
||||
}
|
||||
free(matrix);
|
||||
return lrc_matrix;
|
||||
}
|
||||
|
||||
struct lrc_test_result_t
|
||||
{
|
||||
int success, impossible, failures;
|
||||
};
|
||||
|
||||
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
|
||||
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
|
||||
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
|
||||
{
|
||||
int n = data_drives;
|
||||
int total_rows = n + groups*local + global;
|
||||
int impossible = 0, success = 0, failures = 0;
|
||||
int *lost_per_group = (int*)malloc(sizeof(int) * groups);
|
||||
int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
|
||||
int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
|
||||
// global+1 is always recoverable
|
||||
for (int lost = global+2; lost <= groups*local+global; lost++)
|
||||
{
|
||||
int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
|
||||
int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
|
||||
int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
|
||||
for (int i = 0; i < n; i++)
|
||||
p[i] = i;
|
||||
int *p2 = (int*)malloc(sizeof(int) * n);
|
||||
if (total_rows-lost > n)
|
||||
{
|
||||
p[n-1] = n; // skip combinations with all N data disks (0..n-1)
|
||||
for (int i = n; i < total_rows-lost; i++)
|
||||
p[i] = i+1;
|
||||
p[total_rows-lost-1]--; // will be incremented on the first step
|
||||
}
|
||||
int inc = total_rows-lost-1;
|
||||
while (1)
|
||||
{
|
||||
p[inc]++;
|
||||
if (p[inc] >= n+groups*local+global)
|
||||
{
|
||||
if (inc == 0)
|
||||
break;
|
||||
inc--;
|
||||
}
|
||||
else if (inc+1 < total_rows-lost)
|
||||
{
|
||||
p[inc+1] = p[inc];
|
||||
inc++;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check if it should be recoverable
|
||||
// Calculate count of data chunks lost in each group
|
||||
int nsel = 0;
|
||||
for (int gr = 0; gr < groups; gr++)
|
||||
{
|
||||
lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
|
||||
recovered_per_group[gr] = 0;
|
||||
}
|
||||
for (int j = 0; j < total_rows-lost; j++)
|
||||
{
|
||||
if (p[j] < n)
|
||||
{
|
||||
lost_per_group[(p[j] / (n/groups))]--;
|
||||
selected_inverted[nsel++] = j;
|
||||
}
|
||||
}
|
||||
// Every local parity chunk is supposed to restore 1 missing chunk inside its group
|
||||
// So, subtract local parity chunk counts from each group lost chunk count
|
||||
for (int j = 0; j < total_rows-lost; j++)
|
||||
{
|
||||
if (p[j] >= n && p[j] < n+groups*local)
|
||||
{
|
||||
int gr = (p[j]-n)/local;
|
||||
if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
|
||||
{
|
||||
selected_inverted[nsel++] = j;
|
||||
}
|
||||
recovered_per_group[gr]++;
|
||||
}
|
||||
}
|
||||
// Every global parity chunk is supposed to restore 1 chunk of all that are still missing
|
||||
int still_missing = 0;
|
||||
for (int gr = 0; gr < groups; gr++)
|
||||
{
|
||||
int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
|
||||
still_missing += (non_fixed > 0 ? non_fixed : 0);
|
||||
}
|
||||
for (int j = 0; j < total_rows-lost; j++)
|
||||
{
|
||||
if (p[j] >= n+groups*local)
|
||||
{
|
||||
if (still_missing > 0 && nsel < n)
|
||||
{
|
||||
selected_inverted[nsel++] = j;
|
||||
}
|
||||
still_missing--;
|
||||
}
|
||||
}
|
||||
if (still_missing <= 0)
|
||||
{
|
||||
// We hope it can be recoverable. Try to invert it
|
||||
assert(nsel == n);
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
|
||||
}
|
||||
}
|
||||
int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
|
||||
if (invert_ok < 0)
|
||||
{
|
||||
failures++;
|
||||
if (log_level > 0)
|
||||
{
|
||||
printf("\nFAIL: ");
|
||||
for (int i = 0; i < total_rows-lost; i++)
|
||||
{
|
||||
printf("%d ", p[i]);
|
||||
}
|
||||
printf("\nDIRECT:\n");
|
||||
for (int i = 0; i < total_rows-lost; i++)
|
||||
{
|
||||
for (int j = 0; j < n; j++)
|
||||
printf("%d ", lrc_matrix[p[i]*n+j]);
|
||||
printf("\n");
|
||||
}
|
||||
printf("INVERSE:\n");
|
||||
for (int i = 0; i < total_rows-lost; i++)
|
||||
{
|
||||
for (int j = 0; j < n; j++)
|
||||
printf("%d ", inverted_matrix[i*n+j]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
success++;
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf("OK: ");
|
||||
for (int i = 0; i < total_rows-lost; i++)
|
||||
{
|
||||
printf("%d ", p[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
impossible++;
|
||||
if (log_level > 1)
|
||||
{
|
||||
printf("IMPOSSIBLE: ");
|
||||
for (int i = 0; i < total_rows-lost; i++)
|
||||
{
|
||||
printf("%d ", p[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free(p2);
|
||||
free(p);
|
||||
free(inverted_matrix);
|
||||
free(erased_matrix);
|
||||
}
|
||||
free(lost_per_group);
|
||||
free(recovered_per_group);
|
||||
return (struct lrc_test_result_t){
|
||||
.success = success,
|
||||
.impossible = impossible,
|
||||
.failures = failures,
|
||||
};
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int W = 8, MATRIX_W = 8;
|
||||
int n = 8, groups = 2, local = 1, global = 2;
|
||||
//n = 4, groups = 2, local = 1, global = 1;
|
||||
int total_rows = n+groups*local+global;
|
||||
int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
|
||||
int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
|
||||
// Fill identity+LRC matrix
|
||||
for (int i = 0; i < n; i++)
|
||||
for (int j = 0; j < n; j++)
|
||||
lrc_matrix[i*n + j] = j == i ? 1 : 0;
|
||||
memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
|
||||
free(matrix);
|
||||
matrix = NULL;
|
||||
// Print LRC matrix
|
||||
for (int i = 0; i < total_rows; i++)
|
||||
{
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
printf("%d ", lrc_matrix[i*n+j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
|
||||
printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 1 1 1 1 0 0 0 0
|
||||
// 0 0 0 0 1 1 1 1
|
||||
// 1 55 39 73 84 181 225 217
|
||||
// 1 172 70 235 143 34 200 101
|
||||
//
|
||||
// Can't recover
|
||||
// 1 2 4 5 8 9 10 11 -1
|
||||
// 2 3 4 6 8 9 10 11 -1
|
||||
// FULL:
|
||||
// 1 0 0 0 0 0 0 0
|
||||
// 0 1 0 0 0 0 0 0
|
||||
// 0 0 1 0 0 0 0 0
|
||||
// 0 0 0 1 0 0 0 0
|
||||
// 0 0 0 0 1 0 0 0
|
||||
// 0 0 0 0 0 1 0 0
|
||||
// 0 0 0 0 0 0 1 0
|
||||
// 0 0 0 0 0 0 0 1
|
||||
// 1 1 1 1 0 0 0 0
|
||||
// 0 0 0 0 1 1 1 1
|
||||
// 1 55 39 73 84 181 225 217
|
||||
// 1 172 70 235 143 34 200 101
|
||||
// FIRST UNRECOVERABLE:
|
||||
// 0 1 0 0 0 0 0 0
|
||||
// 0 0 1 0 0 0 0 0
|
||||
// 0 0 0 0 1 0 0 0
|
||||
// 0 0 0 0 0 1 0 0
|
||||
// 1 1 1 1 0 0 0 0
|
||||
// 0 0 0 0 1 1 1 1
|
||||
// 1 55 39 73 84 181 225 217
|
||||
// 1 172 70 235 143 34 200 101
|
||||
// SECOND UNRECOVERABLE:
|
||||
// 0 0 1 0 0 0 0 0
|
||||
// 0 0 0 1 0 0 0 0
|
||||
// 0 0 0 0 1 0 0 0
|
||||
// 0 0 0 0 0 0 1 0
|
||||
// 1 1 1 1 0 0 0 0
|
||||
// 0 0 0 0 1 1 1 1
|
||||
// 1 55 39 73 84 181 225 217
|
||||
// 1 172 70 235 143 34 200 101
|
||||
// Ho ho ho
|
@@ -945,7 +945,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
{
|
||||
if (write_osd_set[i])
|
||||
{
|
||||
memcpy(subm + item_size*pg_minsize*j, matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
@@ -53,6 +53,7 @@ typedef struct VitastorClient
|
||||
char *etcd_host;
|
||||
char *etcd_prefix;
|
||||
char *image;
|
||||
int skip_parents;
|
||||
uint64_t inode;
|
||||
uint64_t pool;
|
||||
uint64_t size;
|
||||
@@ -63,6 +64,10 @@ typedef struct VitastorClient
|
||||
int rdma_gid_index;
|
||||
int rdma_mtu;
|
||||
QemuMutex mutex;
|
||||
|
||||
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
|
||||
uint32_t last_bitmap_granularity;
|
||||
uint8_t *last_bitmap;
|
||||
} VitastorClient;
|
||||
|
||||
typedef struct VitastorRPC
|
||||
@@ -72,6 +77,9 @@ typedef struct VitastorRPC
|
||||
QEMUIOVector *iov;
|
||||
long ret;
|
||||
int complete;
|
||||
uint64_t inode, offset, len;
|
||||
uint32_t bitmap_granularity;
|
||||
uint8_t *bitmap;
|
||||
} VitastorRPC;
|
||||
|
||||
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
||||
@@ -147,6 +155,7 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
||||
if (!strcmp(name, "inode") ||
|
||||
!strcmp(name, "pool") ||
|
||||
!strcmp(name, "size") ||
|
||||
!strcmp(name, "skip-parents") ||
|
||||
!strcmp(name, "use-rdma") ||
|
||||
!strcmp(name, "rdma-port_num") ||
|
||||
!strcmp(name, "rdma-gid-index") ||
|
||||
@@ -227,13 +236,16 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle
|
||||
|
||||
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
||||
{
|
||||
VitastorRPC task;
|
||||
VitastorClient *client = bs->opaque;
|
||||
void *image = NULL;
|
||||
int64_t ret = 0;
|
||||
qemu_mutex_init(&client->mutex);
|
||||
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
||||
// FIXME: Rename to etcd_address
|
||||
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
||||
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
||||
client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
|
||||
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
||||
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
||||
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
||||
@@ -243,23 +255,25 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||
);
|
||||
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||
// Get image metadata (size and readonly flag) or just wait until the client is ready
|
||||
if (!image)
|
||||
client->image = (char*)"x";
|
||||
task.complete = 0;
|
||||
task.bs = bs;
|
||||
if (qemu_in_coroutine())
|
||||
{
|
||||
vitastor_co_get_metadata(&task);
|
||||
}
|
||||
else
|
||||
{
|
||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
BDRV_POLL_WHILE(bs, !task.complete);
|
||||
}
|
||||
client->image = image;
|
||||
if (client->image)
|
||||
{
|
||||
// Get image metadata (size and readonly flag)
|
||||
VitastorRPC task;
|
||||
task.complete = 0;
|
||||
task.bs = bs;
|
||||
if (qemu_in_coroutine())
|
||||
{
|
||||
vitastor_co_get_metadata(&task);
|
||||
}
|
||||
else
|
||||
{
|
||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
BDRV_POLL_WHILE(bs, !task.complete);
|
||||
}
|
||||
client->watch = (void*)task.ret;
|
||||
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
||||
client->size = vitastor_c_inode_get_size(client->watch);
|
||||
@@ -284,6 +298,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
client->size = qdict_get_try_int(options, "size", 0);
|
||||
vitastor_c_close_watch(client->proxy, (void*)task.ret);
|
||||
}
|
||||
if (!client->size)
|
||||
{
|
||||
@@ -305,6 +320,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
qdict_del(options, "inode");
|
||||
qdict_del(options, "pool");
|
||||
qdict_del(options, "size");
|
||||
qdict_del(options, "skip-parents");
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -321,6 +337,8 @@ static void vitastor_close(BlockDriverState *bs)
|
||||
g_free(client->etcd_prefix);
|
||||
if (client->image)
|
||||
g_free(client->image);
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = NULL;
|
||||
}
|
||||
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||
@@ -486,6 +504,13 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||
vitastor_co_init_task(bs, &task);
|
||||
task.iov = iov;
|
||||
|
||||
if (client->last_bitmap)
|
||||
{
|
||||
// Invalidate last bitmap on write
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = NULL;
|
||||
}
|
||||
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
||||
@@ -499,6 +524,140 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||
return task.ret;
|
||||
}
|
||||
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
|
||||
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
|
||||
{
|
||||
VitastorRPC *task = opaque;
|
||||
VitastorClient *client = task->bs->opaque;
|
||||
task->ret = retval;
|
||||
task->complete = 1;
|
||||
if (retval >= 0)
|
||||
{
|
||||
task->bitmap = bitmap;
|
||||
if (client->last_bitmap_inode == task->inode &&
|
||||
client->last_bitmap_offset == task->offset &&
|
||||
client->last_bitmap_len == task->len)
|
||||
{
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = bitmap;
|
||||
}
|
||||
}
|
||||
if (qemu_coroutine_self() != task->co)
|
||||
{
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
|
||||
aio_co_wake(task->co);
|
||||
#else
|
||||
qemu_coroutine_enter(task->co, NULL);
|
||||
qemu_aio_release(task);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static int coroutine_fn vitastor_co_block_status(
|
||||
BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
// Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
|
||||
// Not allocated => return 0
|
||||
// Error => return -errno
|
||||
// Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
|
||||
VitastorRPC task;
|
||||
VitastorClient *client = bs->opaque;
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
uint8_t bit = 0;
|
||||
if (client->last_bitmap && client->last_bitmap_inode == inode &&
|
||||
client->last_bitmap_offset <= offset &&
|
||||
client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
|
||||
{
|
||||
// Use the previously read bitmap
|
||||
task.bitmap_granularity = client->last_bitmap_granularity;
|
||||
task.offset = client->last_bitmap_offset;
|
||||
task.len = client->last_bitmap_len;
|
||||
task.bitmap = client->last_bitmap;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Read bitmap from this position, rounding to full inode PG blocks
|
||||
uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
|
||||
if (!block_size)
|
||||
return -EAGAIN;
|
||||
// Init coroutine
|
||||
vitastor_co_init_task(bs, &task);
|
||||
free(client->last_bitmap);
|
||||
task.inode = client->last_bitmap_inode = inode;
|
||||
task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
|
||||
task.offset = client->last_bitmap_offset = offset / block_size * block_size;
|
||||
task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
|
||||
task.bitmap = client->last_bitmap = NULL;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
while (!task.complete)
|
||||
{
|
||||
qemu_coroutine_yield();
|
||||
}
|
||||
if (task.ret < 0)
|
||||
{
|
||||
// Error
|
||||
return task.ret;
|
||||
}
|
||||
}
|
||||
if (want_zero)
|
||||
{
|
||||
// Get precise mapping with all holes
|
||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
||||
uint64_t bmp_len = task.len / task.bitmap_granularity;
|
||||
uint64_t bmp_end = bmp_pos+1;
|
||||
bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
|
||||
while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
|
||||
{
|
||||
bmp_end++;
|
||||
}
|
||||
*pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Get larger allocated extents, possibly with false positives
|
||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
||||
uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos;
|
||||
while (bmp_pos < bmp_end)
|
||||
{
|
||||
if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
|
||||
{
|
||||
bit = bit || task.bitmap[bmp_pos >> 3];
|
||||
bmp_pos += 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
|
||||
bmp_pos++;
|
||||
}
|
||||
}
|
||||
*pnum = bytes;
|
||||
}
|
||||
if (bit)
|
||||
{
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
}
|
||||
return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
|
||||
}
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||
// QEMU 1.7-2.11
|
||||
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
|
||||
int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
|
||||
{
|
||||
int64_t map = 0;
|
||||
int64_t pnumbytes = 0;
|
||||
int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, &file);
|
||||
*pnum = pnumbytes/BDRV_SECTOR_SIZE;
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
||||
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
||||
{
|
||||
@@ -606,6 +765,15 @@ static BlockDriver bdrv_vitastor = {
|
||||
.bdrv_co_truncate = vitastor_co_truncate,
|
||||
#endif
|
||||
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
|
||||
// For snapshot export
|
||||
.bdrv_co_block_status = vitastor_co_block_status,
|
||||
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||
.bdrv_co_get_block_status = vitastor_co_get_block_status,
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
||||
.bdrv_co_preadv = vitastor_co_preadv,
|
||||
.bdrv_co_pwritev = vitastor_co_pwritev,
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.8.3
|
||||
Version: 0.8.4
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -207,6 +207,28 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
|
||||
client->cli->execute(op);
|
||||
}
|
||||
|
||||
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
int with_parents, VitastorReadBitmapHandler cb, void *opaque)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->opcode = with_parents ? OSD_OP_READ_CHAIN_BITMAP : OSD_OP_READ_BITMAP;
|
||||
op->inode = inode;
|
||||
op->offset = offset;
|
||||
op->len = len;
|
||||
op->callback = [cb, opaque](cluster_op_t *op)
|
||||
{
|
||||
uint8_t *bitmap = NULL;
|
||||
if (op->retval >= 0)
|
||||
{
|
||||
bitmap = (uint8_t*)op->bitmap_buf;
|
||||
op->bitmap_buf = NULL;
|
||||
}
|
||||
cb(opaque, op->retval, bitmap);
|
||||
delete op;
|
||||
};
|
||||
client->cli->execute(op);
|
||||
}
|
||||
|
||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
@@ -245,6 +267,25 @@ uint64_t vitastor_c_inode_get_num(void *handle)
|
||||
return watch->cfg.num;
|
||||
}
|
||||
|
||||
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
|
||||
{
|
||||
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
|
||||
if (pool_it == client->cli->st_cli.pool_config.end())
|
||||
return 0;
|
||||
auto & pool_cfg = pool_it->second;
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
return pool_cfg.data_block_size * pg_data_size;
|
||||
}
|
||||
|
||||
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
|
||||
{
|
||||
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
|
||||
if (pool_it == client->cli->st_cli.pool_config.end())
|
||||
return 0;
|
||||
// FIXME: READ_BITMAP may fails if parent bitmap granularity differs from inode bitmap granularity
|
||||
return pool_it->second.bitmap_granularity;
|
||||
}
|
||||
|
||||
int vitastor_c_inode_get_readonly(void *handle)
|
||||
{
|
||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||
|
@@ -6,6 +6,9 @@
|
||||
#ifndef VITASTOR_QEMU_PROXY_H
|
||||
#define VITASTOR_QEMU_PROXY_H
|
||||
|
||||
// C API wrapper version
|
||||
#define VITASTOR_C_API_VERSION 1
|
||||
|
||||
#ifndef POOL_ID_BITS
|
||||
#define POOL_ID_BITS 16
|
||||
#endif
|
||||
@@ -21,6 +24,7 @@ typedef struct vitastor_c vitastor_c;
|
||||
|
||||
typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
|
||||
typedef void VitastorIOHandler(void *opaque, long retval);
|
||||
typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);
|
||||
|
||||
// QEMU
|
||||
typedef void IOHandler(void *opaque);
|
||||
@@ -42,11 +46,15 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
|
||||
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
|
||||
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
int with_parents, VitastorReadBitmapHandler cb, void *opaque);
|
||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_close_watch(vitastor_c *client, void *handle);
|
||||
uint64_t vitastor_c_inode_get_size(void *handle);
|
||||
uint64_t vitastor_c_inode_get_num(void *handle);
|
||||
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
|
||||
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
|
||||
int vitastor_c_inode_get_readonly(void *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@@ -22,6 +22,16 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
|
||||
|
||||
qemu-img convert -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
|
||||
-O qcow2 ./testdata/layer0.qcow2
|
||||
|
||||
qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
|
||||
|
||||
qemu-img convert -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
|
||||
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
|
||||
-O raw ./testdata/merged.bin
|
||||
@@ -52,4 +62,18 @@ qemu-img convert -S 4096 -p \
|
||||
|
||||
cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
|
||||
|
||||
# Test merge by qemu-img
|
||||
|
||||
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||
|
||||
cmp ./testdata/merged.bin ./testdata/rebased.bin
|
||||
|
||||
qemu-img rebase -u -b '' ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||
|
||||
cmp ./testdata/layer1.bin ./testdata/rebased.bin
|
||||
|
||||
format_green OK
|
||||
|
Reference in New Issue
Block a user