Compare commits
2 commits: test-etcd-...cow-meta

Commits compared:
2f3b1f37a2
abd5cbfbe4
@@ -684,24 +684,6 @@ jobs:
echo ""
done

test_write_iothreads:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

test_write_no_same:
runs-on: ubuntu-latest
needs: build

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

project(vitastor)

set(VITASTOR_VERSION "2.2.2")
set(VITASTOR_VERSION "2.2.0")

add_subdirectory(src)

Submodule cpp-btree updated: 8de8b467ac...a21350e484

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.2.0

all: build push

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

@@ -5,7 +5,7 @@ package vitastor

const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "2.2.2"
vitastorCSIDriverVersion = "2.2.0"
)

// Config struct fills the parameters of request or user input
debian/changelog (vendored)
@@ -1,4 +1,4 @@
vitastor (2.2.2-1) unstable; urgency=medium
vitastor (2.2.0-1) unstable; urgency=medium

* Bugfixes

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.2.0

all: build push

@@ -4,7 +4,7 @@
#

# Desired Vitastor version
VITASTOR_VERSION=v2.2.2
VITASTOR_VERSION=v2.2.0

# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here
@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.

1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.2`
`docker pull vitastor:v2.2.0`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`
@@ -25,9 +25,9 @@ Vitastor can be installed in Docker/Podman. In this case etcd,
The installation instructions are as simple as possible.

1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.2`
`docker pull vitastor:v2.2.0`
2. Install the scripts into the host system with: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`
@@ -6,10 +6,10 @@

# Proxmox VE

To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):

- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),
@@ -6,10 +6,10 @@

# Proxmox VE

To connect Vitastor to Proxmox Virtual Environment (versions 6.4-8.x are supported):
To connect Vitastor to Proxmox Virtual Environment (versions 6.4-8.1 are supported):

- Add the corresponding Vitastor Debian repository into sources.list on the Proxmox hosts:
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install the vitastor-client, pve-qemu-kvm and pve-storage-vitastor (* or see the note) packages from the Vitastor repository
- Define the storage type in `/etc/pve/storage.cfg` (see below)
- Make sure to block access from virtual machines to the Vitastor network (OSDs and etcd), because Vitastor does not (yet) support authentication
@@ -10,17 +10,8 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

Join Vitastor Telegram Chat: https://t.me/vitastor

License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.

Server-side code is licensed only under the terms of VNPL.

Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

## VNPL

Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
@@ -29,83 +20,18 @@ the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.

The idea of VNPL is, in addition to modules linked to Vitastor code in a single
binary file, to extend copyleft action to micro-service modules only interacting
with it over the network.
Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.

Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.

At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

## Explanation

Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.

A program is considered to be a "Proxy Program" if it meets both conditions:
- It is specially designed to be used with Vitastor. Basically, it means that the program
has any functionality specific to Vitastor and thus "knows" that it works with Vitastor,
not with something random.
- It interacts with Vitastor directly or indirectly through any programming interface,
including API, CLI, network or any wrapper (also considered a Proxy Program itself).

If, in addition to that:
- You give any user an opportunity to interact with Vitastor directly or indirectly through
any computer interface including the network or any number of wrappers (Proxy Programs).

Then VNPL requires you to publish the code of all above Proxy Programs to all above users
under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2,
because "GPL compatibility" is treated as an ability to legally include licensed code in
a GPL application.

So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
interacts with Vitastor - you are forbidden to use Vitastor under the terms of VNPL and you
need a commercial license which doesn't contain open-source requirements.

## Examples

- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it is designed specially for use with Vitastor (it has no sense otherwise).
- So, CSI driver **definitely IS** a Proxy Program and must be published under the terms of
a free software license.
- Windows, installed in a VM with the system disk on Vitastor storage.
- Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
device interface, emulated by QEMU.
- No, it definitely isn't designed specially for use with Vitastor - Windows was created long
ago before Vitastor and doesn't know anything about it.
- So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require to open it.
- Cloud control panel which makes requests to Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
- May or may not be designed specially for use with Vitastor. How to determine exactly?
Imagine that Vitastor is replaced with any other storage (for example, with a proprietary one).
Do control panel functions change in any way? If they do (for example, if snapshots stop working),
then the panel contains specific functionality and thus is designed specially for use with Vitastor.
Otherwise, the panel is universal and isn't designed specially for Vitastor.
- So, whether you are required to open-source the panel also **depends** on whether it
contains specific functionality or not.

## Why?

Because I believe in the spirit of copyleft (Linux wouldn't become so popular without GPL!)
and, at the same time, I want to have a way to monetize the product.

Existing licenses including AGPL are useless for it with an SDS - SDS is a very deeply
internal software which is almost definitely invisible to the user and thus AGPL doesn't
require anyone to open the code even if they make a proprietary fork.

And, in fact, the current situation in the world where GPL is thought to only restrict direct
linking of programs into a single executable file, isn't much correct. Nowadays, programs
are more often linked with network API calls, not with /usr/bin/ld, and a software product
may consist of dozens of microservices interacting with each other over the network.

That's why we need VNPL to keep the license sufficiently copyleft.

## License Texts

- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).
@@ -12,14 +12,6 @@

License: VNPL 1.1 for the server-side code and dual VNPL 1.1 + GPL 2.0+ for the client code.

Server-side components are distributed only under the terms of VNPL.

Client libraries are distributed under a dual license: VNPL 1.0
and also GNU GPL 2.0 or any later version. This is done for
compatibility with software such as QEMU and fio.

## VNPL

VNPL is "network copyleft", our own free copyleft license -
the Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
"Network Interaction" clause which requires distributing all programs,
@@ -37,70 +29,9 @@ Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
No restrictions are imposed on Windows or any other software that was not
developed *specifically* for use together with Vitastor.

## Explanation
Client libraries are distributed under a dual license: VNPL 1.0
and also GNU GPL 2.0 or any later version. This is done for
compatibility with software such as QEMU and fio.

Network copyleft is governed by the license clause **13. Remote Network Interaction**.

A program is considered a "proxy program" if both conditions hold:
- It was created specifically to work together with Vitastor. In essence this means that the program
must have Vitastor-specific functionality, i.e. it must "know" that it interacts
specifically with Vitastor.
- It interacts with Vitastor directly or indirectly through absolutely any programming
interface, including any method of invocation: API, CLI, network or some wrapper (which is,
in turn, also a proxy program).

If, in addition to that:
- You give any user an opportunity to interact with Vitastor over the network,
again, through any interface or any chain of "wrappers" (proxy programs)

Then, according to VNPL, you must open the code of the "proxy programs" **to those users** under
the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2 - where "GPL compatibility"
is understood as the ability to include the licensed code in a GPL application.

Accordingly, if you have a "proxy program" but its code is not open to the user
who directly or indirectly interacts with Vitastor, you are forbidden to use Vitastor
under the terms of VNPL and you need a commercial license which contains no code-opening requirements.

## Examples

- The Vitastor Kubernetes CSI driver, which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it was created specifically to work with Vitastor (what other purpose would it have).
- So the CSI driver **definitely counts** as a "proxy program" and must be published under a free
license.
- Windows, installed in a virtual machine on a Vitastor disk.
- Yes, it interacts with Vitastor "directly or indirectly" - it writes and reads data through the block
device interface emulated by QEMU.
- No, it was definitely not created *specifically to work with Vitastor* - when it was created,
Vitastor did not exist at all.
- So Windows **definitely does not count** as a "proxy program" and the VNPL requirements do not apply to it.
- A cloud control panel which makes requests to the Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a "proxy program".
- It is not immediately clear whether it was created specifically to work with Vitastor. How to tell?
Imagine that Vitastor is replaced with any other storage system (for example, a proprietary one).
Does the control panel's behaviour change? If yes (for example, snapshots stop working), then
the panel contains specific functionality and "was created specifically to work with Vitastor".
If not, the panel contains no specific functionality and is essentially universal.
- Whether the panel has to be opened therefore **depends** on whether it contains specific functionality or not.

## Why so?

Because I simultaneously believe in the spirit of copyleft licenses (Linux wouldn't have become
so popular without GPL!) and want to have a way to monetize the product.

At the same time, even AGPL is pointless for a software-defined storage system: it is deeply internal
software which the user will almost certainly never see at all, so nobody would ever have
to open the code, even when creating a derived product.

And in general, the situation that has developed in the world, where the effect of GPL is limited
to direct linking into a single executable file, is not quite correct. Nowadays programs are
integrated much more often through network calls than with /usr/bin/ld, and a combined software
product may consist of several dozen microservices interacting over the network.

That's why VNPL was devised - to keep the license sufficiently "copyleft".

## License Texts

- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
You can find the full text of VNPL 1.1 in English in the file [VNPL-1.1.txt](../../VNPL-1.1.txt),
VNPL 1.1 in Russian in the file [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), and GPL 2.0 in the file [GPL-2.0.txt](../../GPL-2.0.txt).
@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "2.2.2",
"version": "2.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

@@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "2.2.2",
"version": "2.2.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [

@@ -410,8 +410,8 @@ sub volume_size_info
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
# (size, format, used, parent, ctime)
return wantarray ? ($info->{size}, $info->{format}, $info->{size}, $info->{parent}, 0) : $info->{size};
#return wantarray ? ($size, $format, $used, $parent, $st->ctime) : $size;
return $info->{size};
}

sub volume_resize

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

VITASTOR_VERSION = '2.2.2'
VITASTOR_VERSION = '2.2.0'

LOG = logging.getLogger(__name__)

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el7.tar.gz
Source0: vitastor-2.2.0.el7.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el8.tar.gz
Source0: vitastor-2.2.0.el8.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el9.tar.gz
Source0: vitastor-2.2.0.el9.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()

add_definitions(-DVITASTOR_VERSION="2.2.2")
add_definitions(-DVITASTOR_VERSION="2.2.0")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

@@ -31,6 +31,7 @@
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define MAX_META_BLOCK_SIZE 64*1024
#define DEFAULT_BITMAP_GRANULARITY 4096

#define BS_OP_MIN 1

@@ -127,9 +127,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
else if (meta_block_size > MAX_META_BLOCK_SIZE)
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_META_BLOCK_SIZE));
}
if (data_offset % disk_alignment)
{
@@ -427,13 +427,6 @@ stop_flusher:
printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
flusher->active_flushers++;
// Find it in clean_db
{
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
}
// Scan dirty versions of the object to determine what we need to read
scan_dirty();
// Writes and deletes shouldn't happen at the same time

@@ -538,7 +531,7 @@ resume_2:
{
// zero out old metadata entry
{
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos);
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",

@@ -547,7 +540,7 @@ resume_2:
exit(1);
}
}
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos, 0, bs->dsk.clean_entry_size);
resume_20:
if (meta_old.sector != meta_new.sector && !write_meta_block(meta_old, 20))
return false;

@@ -608,7 +601,7 @@ resume_2:

void journal_flusher_co::update_metadata_entry()
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf(

@@ -623,7 +616,7 @@ void journal_flusher_co::update_metadata_entry()
if (has_delete)
{
// Zero out the new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos, 0, bs->dsk.clean_entry_size);
}
else
{

@@ -805,7 +798,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
}
}
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
if (new_entry->oid != cur.oid)
{
printf(

@@ -912,6 +905,12 @@ void journal_flusher_co::calc_block_checksums(uint32_t *new_data_csums, bool ski

void journal_flusher_co::scan_dirty()
{
// Find it in clean_db
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
auto old_clean_bitmap = (clean_it != clean_db.end() ? bs->get_clean_entry_bitmap(clean_it, 0) : NULL);
dirty_it = dirty_start = dirty_end;
v.clear();
copy_count = 0;

@@ -1037,13 +1036,12 @@ void journal_flusher_co::scan_dirty()
read_to_fill_incomplete = 0;
return;
}
uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0);
uint64_t fulfilled = 0;
int last = v.size()-1;
while (last >= 0 && (v[last].copy_flags & COPY_BUF_CSUM_FILL))
last--;
read_to_fill_incomplete = bs->fill_partial_checksum_blocks(
v, fulfilled, bmp_ptr, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
v, fulfilled, old_clean_bitmap, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
((v[last].offset+v[last].len-1) / bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size
);
}

@@ -1139,7 +1137,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
resume_0:
if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
return false;
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos + sizeof(clean_disk_entry);
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{
resume_1:

@@ -1193,7 +1191,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
// so I'll avoid it as long as I can.
wr.submitted = false;
wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.clean_entry_size;
if (bs->inmemory_meta)
{
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
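As a side note (not part of the diff), the repeated `meta_old.pos*bs->dsk.clean_entry_size` to `meta_old.pos` replacements above reflect a change of meaning: `pos` used to be an entry index and is now a byte offset, pre-multiplied once in `modify_meta_read()`. A minimal sketch of the two conventions, assuming illustrative 4096-byte metadata blocks, 32-byte clean entries and block_order 17 (the real values come from `dsk`):

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative values only; the blockstore reads these from its disk config (dsk.*).
    const uint64_t meta_block_size = 4096, clean_entry_size = 32, block_order = 17;
    uint64_t meta_loc = 300ull << block_order;                          // location of data block #300
    uint64_t entries_per_block = meta_block_size / clean_entry_size;    // 128
    uint64_t old_pos = (meta_loc >> block_order) % entries_per_block;   // old meaning: entry index
    uint64_t new_pos = old_pos * clean_entry_size;                      // new meaning: byte offset
    printf("entry index = %llu, byte offset = %llu\n",
        (unsigned long long)old_pos, (unsigned long long)new_pos);      // 44 and 1408
    return 0;
}
```

Either convention addresses the same entry; the new one simply saves every caller a multiplication.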
@@ -42,6 +42,8 @@ blockstore_impl_t::~blockstore_impl_t()
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
if (heap_meta.blocks)
delete[] heap_meta.blocks;
}

bool blockstore_impl_t::is_started()

@@ -431,13 +433,29 @@ blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
auto sett_it = clean_db_settings.find(pool_id);
if (sett_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
pg_num = (oid.stripe / sett_it->second.pg_stripe_size) % sett_it->second.pg_count + 1;
}
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
auto shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto sh_it = clean_db_shards.find(shard_id);
if (sh_it == clean_db_shards.end())
{
// clean_db_t stores larger entries with heap_meta, but we disguise it as smaller clean_entry :)
// patched cpp-btree with extra_data
clean_db_shards[shard_id] = blockstore_clean_db_t(
sizeof(clean_entry_heap_t) - sizeof(clean_entry)
+ (inmemory_meta ? dsk.clean_dyn_size : 2*dsk.clean_entry_bitmap_size)
);
return clean_db_shards[shard_id];
}
return sh_it->second;
}
return clean_db_shards[shard_id];
}

void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
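To make the `clean_db_shard()` change above easier to follow, here is a standalone sketch of the shard-id arithmetic (not from the diff). It assumes POOL_ID_BITS is 16, as in Vitastor inode numbering, and uses made-up pool/PG parameters:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    const int POOL_ID_BITS = 16;                                  // assumption: 16-bit pool id in the inode number
    uint64_t inode = ((uint64_t)5 << (64-POOL_ID_BITS)) | 123;    // pool 5, inode 123 inside the pool
    uint64_t stripe = 3*1048576ull;                               // object offset within the inode
    uint64_t pg_stripe_size = 1048576, pg_count = 256;            // illustrative pool settings
    uint64_t pool_id = inode >> (64-POOL_ID_BITS);
    uint64_t pg_num = (stripe / pg_stripe_size) % pg_count + 1;   // "like map_to_pg()"
    uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;  // key into clean_db_shards
    printf("pool=%llu pg=%llu shard_id=%016llx\n", (unsigned long long)pool_id,
        (unsigned long long)pg_num, (unsigned long long)shard_id);
    return 0;
}
```

The new heap branch only changes how a missing shard is constructed (with extra per-entry space for `clean_entry_heap_t` and its bitmaps), not this key computation.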
@@ -96,6 +96,9 @@
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BLOCKSTORE_META_HEADER_V1_SIZE 36
#define BLOCKSTORE_META_HEADER_V2_SIZE 48

// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t

@@ -119,6 +122,7 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint32_t block_id_bits; // 32 by default in heap meta
};

// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)

@@ -140,6 +144,62 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location;
};

typedef uint32_t heap_block_num_t;

// 50 = 16 (key=object_id) + 26 (value) + 8 (bitmap) + N (checksum) bytes per "clean" entry in memory
struct __attribute__((__packed__)) clean_entry_heap_t
{
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
// previous versions invalidated by this version
heap_block_num_t prev_versions;
// metadata block number
heap_block_num_t meta_block;
// offset within block
uint16_t block_offset;
uint8_t bitmap[];
};

struct __attribute__((__packed__)) heap_meta_block_header_t
{
uint64_t magic;
uint64_t seq_num;
uint32_t invalidates_blocks;
};

// 48+checksums = (40+bitmap)+checksums bytes per on-disk "heap" entry
// for 128 KB block without checksums, it's 48 bytes - 84 entries per 4 kb metadata block
// for 128 KB block with 4k checksums, it's 176 bytes - 22 entries per 4 kb metadata block
// for 1 MB block without checksums, it's 80 bytes - 50 entries per 4 kb metadata block
// for 1 MB block with 4k checksums, it's 1104 bytes O_o - only 3 entries per 4 kb metadata block
// for 1 MB block with 32k checksums, it's 176 bytes again
struct __attribute__((__packed__)) heap_meta_entry_t
{
object_id oid;
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
uint64_t reserved;
uint8_t bitmap[];
};

struct heap_meta_block_t
{
heap_block_num_t offset = 0;
uint64_t seq_num = 0;
uint32_t used_space = 0;
std::vector<uint64_t> invalidates_blocks;
};

struct heap_meta_t
{
heap_block_num_t block_count = 0;
heap_meta_block_t *blocks = NULL;
// used space => block number
std::multimap<uint32_t, heap_block_num_t> used_space_map;
heap_block_num_t cur_written_block = 0;
uint8_t *written_block_buf = NULL;
};

// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{

@@ -272,6 +332,8 @@ class blockstore_impl_t

struct ring_consumer_t ring_consumer;

heap_meta_t heap_meta;

std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;

@@ -317,7 +379,7 @@ class blockstore_impl_t
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
uint8_t* get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset);

blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);

@@ -345,9 +407,9 @@ class blockstore_impl_t
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,

@@ -356,7 +418,7 @@ class blockstore_impl_t
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
uint8_t* read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
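A quick back-of-the-envelope check of the size comments in `heap_meta_entry_t` above (not part of the diff). It assumes a 4096-byte metadata block, the 20-byte `heap_meta_block_header_t`, a 4-byte trailing block CRC, 4096-byte bitmap granularity and no data checksums; with those numbers one on-disk entry is 48 bytes and 84 entries fit per block, matching the comment:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative assumptions; the real sizes come from dsk.* and the structs above.
    const uint64_t meta_block_size = 4096, header_size = 20, block_crc_size = 4;
    const uint64_t data_block_size = 128*1024, bitmap_granularity = 4096;
    const uint64_t csum_block_size = 0;                                     // 0 = data checksums disabled
    uint64_t bitmap_bytes = (data_block_size/bitmap_granularity + 7) / 8;   // 4 bytes
    uint64_t dyn_size = 2*bitmap_bytes                                      // block bitmap + external attribute bitmap
        + (csum_block_size ? (data_block_size/csum_block_size)*4 : 0);      // + one crc32c per checksum block
    uint64_t entry_size = 40 + dyn_size;                                    // 40 = oid + version + location + reserved
    uint64_t per_block = (meta_block_size - header_size - block_crc_size) / entry_size;
    printf("entry = %llu bytes, %llu entries per metadata block\n",
        (unsigned long long)entry_size, (unsigned long long)per_block);     // 48 bytes, 84 entries
    return 0;
}
```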
@@ -54,6 +54,7 @@ int blockstore_init_meta::loop()
else if (wait_state == 4) goto resume_4;
else if (wait_state == 5) goto resume_5;
else if (wait_state == 6) goto resume_6;
else if (wait_state == 7) goto resume_7;
printf("Reading blockstore metadata\n");
if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer;

@@ -78,6 +79,7 @@ resume_1:
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;

@@ -85,12 +87,19 @@ resume_1:
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_HEAP)
{
hdr->block_id_bits = sizeof(heap_block_num_t);
}
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
hdr->data_csum_type = bs->dsk.data_csum_type;
hdr->csum_block_size = bs->dsk.csum_block_size;
hdr->header_csum = 0;
hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
hdr->header_csum = crc32c(0, hdr,
bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_V2
? BLOCKSTORE_META_HEADER_V2_SIZE
: sizeof(*hdr));
}
}
if (bs->readonly)

@@ -128,7 +137,7 @@ resume_1:
);
exit(1);
}
if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;

@@ -138,6 +147,23 @@ resume_1:
exit(1);
}
hdr->header_csum = csum;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_HEAP;
if (hdr->block_id_bits != sizeof(heap_block_num_t))
{
printf("Heap metadata block ID size (%u) is not supported by this build\n", hdr->block_id_bits);
exit(1);
}
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;
if (crc32c(0, hdr, BLOCKSTORE_META_HEADER_V2_SIZE) != csum)
{
printf("Metadata header is corrupt (checksum mismatch).\n");
exit(1);
}
hdr->header_csum = csum;
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
{
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;

@@ -160,11 +186,11 @@ resume_1:
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
}
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
else
{
printf(
"Metadata format is too new for me (stored version is %ju, max supported %u).\n",
hdr->version, BLOCKSTORE_META_FORMAT_V2
hdr->version, BLOCKSTORE_META_FORMAT_HEAP
);
exit(1);
}

@@ -189,7 +215,12 @@ resume_1:
// Skip superblock
md_offset = bs->dsk.meta_block_size;
next_offset = md_offset;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size; // FIXME only array
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
bs->heap_meta.blocks = new heap_meta_block_t[bs->dsk.meta_len / bs->dsk.meta_block_size];
bs->heap_meta.block_count = bs->dsk.meta_len / bs->dsk.meta_block_size;
}
// Read the rest of the metadata
resume_2:
if (next_offset < bs->dsk.meta_len && submitted == 0)

@@ -233,9 +264,10 @@ resume_2:
bool changed = false;
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
auto this_changed = bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP
? handle_heap_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset)
: handle_array_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset);
if (this_changed)
changed = true;
}
if (changed && !bs->inmemory_meta && !bs->readonly)

@@ -262,6 +294,41 @@ resume_2:
wait_state = 2;
return 1;
}
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
// build used_space index
for (heap_block_num_t i = 0; i < bs->heap_meta.block_count; i++)
{
bs->heap_meta.used_space_map.emplace(std::pair<uint32_t, heap_block_num_t>(bs->heap_meta.blocks[i].used_space, i));
}
}
if (heap_invalidated_block_seq.size() && !bs->readonly)
{
// zero out invalidated blocks not zeroed during the previous OSD execution
for (auto inv_seq: heap_invalidated_block_seq)
{
auto num_it = heap_block_by_seq.find(inv_seq);
if (num_it != heap_block_by_seq.end())
heap_invalidated_block_nums.push_back(num_it->second);
}
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
for (i = 0; i < heap_invalidated_block_nums.size(); i++)
{
GET_SQE();
last_read_offset = heap_invalidated_block_nums[i]*bs->dsk.meta_block_size;
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + last_read_offset);
bs->ringloop->submit();
submitted++;
resume_7:
if (submitted > 0)
{
wait_state = 7;
return 1;
}
}
}
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
{
std::sort(entries_to_zero.begin(), entries_to_zero.end());

@@ -329,8 +396,9 @@ resume_6:
return 0;
}

bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_offset)
{
uint64_t done_cnt = (block_offset / bs->dsk.meta_block_size) * entries_per_block;
bool updated = false;
uint64_t max_i = entries_per_block;
if (max_i > bs->dsk.block_count-done_cnt)

@@ -429,6 +497,132 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
return updated;
}

static int bitmap_count_ones(uint8_t *bitmap, int size)
{
int n = 0, i = 0;
for (; i <= size-sizeof(unsigned); i += sizeof(unsigned))
{
n += __builtin_popcount(*(unsigned*)(bitmap+i));
}
for (; i < size; i++)
{
n += __builtin_popcount(*(unsigned char*)(bitmap+i));
}
return n;
}

// v3 / heap / "cow" metadata block
bool blockstore_init_meta::handle_heap_meta_block(uint8_t *buf, uint64_t block_offset)
{
if ((block_offset / bs->dsk.meta_block_size) > (heap_block_num_t)-1)
{
fprintf(stderr, "Metadata area too large\n");
exit(1);
}
// Validate block CRC
uint32_t block_crc = *(uint32_t*)(buf + bs->dsk.meta_block_size - 4);
if (crc32c(0, buf, bs->dsk.meta_block_size-4) != block_crc)
{
return false;
}
// Validate header
heap_meta_block_header_t *hdr = (heap_meta_block_header_t*)buf;
if (hdr->magic != BLOCKSTORE_META_MAGIC_V1)
{
return false;
}
if (hdr->invalidates_blocks > (bs->dsk.meta_block_size-4-sizeof(heap_meta_block_header_t))/sizeof(uint64_t))
{
fprintf(stderr, "Metadata block at %jx contains too large invalidates_blocks count: %x\n", block_offset, hdr->invalidates_blocks);
exit(1);
}
if (heap_invalidated_block_seq.find(hdr->seq_num) != heap_invalidated_block_seq.end())
{
// Check if the block is invalidated and handled after the block that invalidates it
return false;
}
uint64_t hdr_size = sizeof(heap_meta_block_header_t) + hdr->invalidates_blocks*8;
heap_meta_block_t & blk = bs->heap_meta.blocks[block_offset/bs->dsk.meta_block_size];
blk.offset = block_offset;
blk.seq_num = hdr->seq_num;
blk.used_space = hdr_size + 4;
uint64_t *hdr_inv = (uint64_t*)(hdr + 1);
for (int i = 0; i < hdr->invalidates_blocks; i++)
{
blk.invalidates_blocks.push_back(hdr_inv[i]);
heap_invalidated_block_seq.insert(hdr_inv[i]);
}
heap_block_by_seq[hdr->seq_num] = block_offset;
// Process sub-blocks
uint64_t heap_entry_size = sizeof(heap_meta_entry_t) + bs->dsk.clean_dyn_size;
for (uint64_t pos = sizeof(heap_meta_block_header_t); pos < bs->dsk.meta_block_size-4; pos += heap_entry_size)
{
heap_meta_entry_t *diskentry = (heap_meta_entry_t*)(buf + pos);
if (!diskentry->oid.inode || !diskentry->version)
{
continue;
}
auto & clean_db = bs->clean_db_shard(diskentry->oid);
auto mementry = (clean_entry_heap_t*)(&clean_db[diskentry->oid]);
bool exists = mementry->version != 0;
if (exists && mementry->version >= diskentry->version)
{
if (mementry->version == diskentry->version)
{
// Voluntarily allow duplicates of in-memory entries with different
// bitmaps to support checksum updates with hole-punching
int old_count = bitmap_count_ones(mementry->bitmap, bs->dsk.clean_entry_bitmap_size);
int new_count = bitmap_count_ones(diskentry->bitmap, bs->dsk.clean_entry_bitmap_size);
if (old_count < new_count)
{
continue;
}
}
else
{
continue;
}
}
blk.used_space += heap_entry_size;
if (exists && mementry->location != UINT64_MAX)
{
// free the previous block
uint64_t old_clean_loc = mementry->location >> bs->dsk.block_order;
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", 1+old_clean_loc,
diskentry->oid.inode, diskentry->oid.stripe, mementry->version);
#endif
bs->data_alloc->set(old_clean_loc, false);
bs->inode_space_stats[diskentry->oid.inode] -= bs->dsk.data_block_size;
bs->used_blocks--;
bs->heap_meta.blocks[mementry->meta_block].used_space -= heap_entry_size;
}
if (diskentry->location != UINT64_MAX)
{
bs->data_alloc->set(diskentry->location >> bs->dsk.block_order, true);
bs->inode_space_stats[diskentry->oid.inode] += bs->dsk.data_block_size;
bs->used_blocks++;
#ifdef BLOCKSTORE_DEBUG
printf("Allocate block (heap entry) %ju: %jx:%jx v%ju\n", 1 + (diskentry->location >> bs->dsk.block_order),
diskentry->oid.inode, diskentry->oid.stripe, diskentry->version);
#endif
}
mementry->version = diskentry->version;
mementry->location = diskentry->location;
mementry->meta_block = block_offset / bs->dsk.meta_block_size;
mementry->block_offset = block_offset % bs->dsk.meta_block_size;
if (exists)
{
mementry->prev_versions++;
}
// Extra data: 2 bitmaps + checksums or just 2 bitmaps if inmemory_meta is disabled
memcpy(&mementry->bitmap, &diskentry->bitmap, bs->inmemory_meta ? bs->dsk.clean_dyn_size : 2*bs->dsk.clean_entry_bitmap_size);
entries_loaded++;
}
// We have to zero out headers of invalidated blocks, but we'll do it later
return false;
}

blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
{
this->bs = bs;
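One non-obvious part of `handle_heap_meta_block()` above is how duplicate entries for the same object are resolved during the scan: a higher version always replaces a lower one, and for equal versions the on-disk copy is applied only if its bitmap has no more bits set than the in-memory one (which is what checksum updates with hole-punching leave behind). A small standalone restatement of that rule, not part of the diff:

```cpp
#include <cstdint>
#include <cstdio>

// Returns true if the entry read from disk should replace the in-memory one,
// mirroring the version/popcount checks in handle_heap_meta_block().
static bool disk_entry_applied(bool exists, uint64_t mem_ver, int mem_bits,
    uint64_t disk_ver, int disk_bits)
{
    if (!exists || disk_ver > mem_ver)
        return true;                 // first sighting or strictly newer version
    if (disk_ver < mem_ver)
        return false;                // older version is ignored
    return disk_bits <= mem_bits;    // same version: the sparser (hole-punched) copy wins
}

int main()
{
    printf("%d\n", disk_entry_applied(false, 0, 0, 3, 8)); // 1: object not seen yet
    printf("%d\n", disk_entry_applied(true, 5, 8, 6, 8));  // 1: newer version
    printf("%d\n", disk_entry_applied(true, 5, 8, 5, 4));  // 1: same version, fewer bits set
    printf("%d\n", disk_entry_applied(true, 5, 4, 5, 8));  // 0: same version, more bits set
    return 0;
}
```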
@@ -28,7 +28,13 @@ class blockstore_init_meta
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);

std::map<uint64_t, heap_block_num_t> heap_block_by_seq;
std::set<uint64_t> heap_invalidated_block_seq;
std::vector<heap_block_num_t> heap_invalidated_block_nums;

bool handle_array_meta_block(uint8_t *buf, uint64_t block_offset);
bool handle_heap_meta_block(uint8_t *buf, uint64_t block_offset);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);

@@ -111,6 +111,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
metadata_buf_size = 4*1024*1024;
}
if (metadata_buf_size % dsk.meta_block_size)
{
metadata_buf_size = ((metadata_buf_size+dsk.meta_block_size-1) / dsk.meta_block_size) * dsk.meta_block_size;
}
if (dsk.meta_device == dsk.data_device)
{
disable_meta_fsync = disable_data_fsync;
@@ -148,10 +148,14 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
return r;
}

uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset)
{
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
return ((uint8_t*)&clean_it->second) + sizeof(clean_entry_heap_t) + offset;
}
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = block_loc >> dsk.block_order;
uint64_t meta_loc = clean_it->second.location >> dsk.block_order;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;

@@ -159,7 +163,9 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
}
return clean_entry_bitmap;
}

@@ -433,7 +439,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!IS_JOURNAL(dirty.state))
{
// Read from data disk, possibly checking checksums
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
if (!fulfill_clean_read_journal(read_op, fulfilled, bmp_ptr, dyn_data,
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
{
goto undo_read;

@@ -464,14 +470,13 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = clean_it->second.version;
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
if (fulfilled < read_op->len)
{
if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
clean_it->second.location, clean_it->second.version))
if (!fulfill_clean_read_meta(read_op, fulfilled, clean_it))
{
goto undo_read;
}

@@ -581,40 +586,22 @@ int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_bu
return 0;
}

bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
{
bool from_journal = clean_entry_bitmap != NULL;
if (!clean_entry_bitmap)
{
// NULL clean_entry_bitmap means we're reading from data, not from the journal,
// and the bitmap location is obvious
clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
}
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, true,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && !from_journal && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
{
return false;
}
}
PRIV(read_op)->clean_block_used = req > 0;
}
else if (from_journal)
else
{
// Don't scan bitmap - journal writes don't have holes (internal bitmap)!
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +

@@ -635,6 +622,43 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
}
// Increment reference counter if clean data is being read from the disk
if (PRIV(read_op)->clean_block_used)
{
auto & uo = used_clean_objects[clean_loc];
uo.refs++;
if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
uo.was_changed = true;
PRIV(read_op)->clean_block_used = clean_loc;
}
return true;
}

bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it)
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it, 0);
uint64_t clean_loc = clean_it->second.location;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, NULL, false,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_it, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
return false;
}
PRIV(read_op)->clean_block_used = req > 0;
}
else
{
bool csum_done = !dsk.csum_block_size || inmemory_meta;

@@ -662,13 +686,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
if (!csum_done)
{
// Read checksums from disk
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
csum_buf = read_clean_meta_block(read_op, clean_it, PRIV(read_op)->read_vec.size());
csum_done = true;
}
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, NULL))
{
return false;
}

@@ -688,11 +712,22 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
return true;
}

uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos)
{
uint64_t sector, pos;
auto & rv = PRIV(op)->read_vec;
auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto clean_heap_entry = (clean_entry_heap_t*)(&clean_it->second);
sector = clean_heap_entry->meta_block * dsk.meta_block_size;
pos = clean_heap_entry->block_offset;
}
else
{
auto clean_loc = clean_it->second.location;
sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
}
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
.copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,

@@ -807,11 +842,6 @@ bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint6
if (from_journal)
return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
if (!dyn_data)
{
assert(inmemory_meta);
dyn_data = get_clean_entry_bitmap(clean_loc, 0);
}
return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
}

@@ -869,8 +899,18 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
if (!uo.was_changed)
{
bool from_journal = (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG);
auto csum_buf = rv[i].csum_buf;
if (!from_journal && !csum_buf)
{
assert(inmemory_meta);
auto & clean_db = clean_db_shard(op->oid);
auto clean_it = clean_db.find(op->oid);
assert(clean_it != clean_db.end());
csum_buf = get_clean_entry_bitmap(clean_it, 0);
}
verify_clean_padded_checksums(
op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
op, rv[i].disk_offset, csum_buf, from_journal, iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;

@@ -1019,7 +1059,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = clean_it->second.version;
if (bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
return 0;
@@ -57,7 +57,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        version = clean_it->second.version + 1;
        if (!is_del)
        {
            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
            void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
            memcpy(dyn_ptr, bmp_ptr, dsk.clean_entry_bitmap_size);
        }
    }
@@ -341,7 +341,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        cancel_all_writes(op, dirty_it, -ENOSPC);
        return 2;
    }
    if (inmemory_meta)
    if (inmemory_meta && dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
    {
        // Check once more that metadata entry is zeroed (the reverse means a bug or corruption)
        uint64_t sector = (loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;

@@ -188,7 +188,7 @@ void osd_messenger_t::init()
        auto cl = cl_it->second;
        cl_it++;
        auto peer_fd = cl->peer_fd;
        if (!cl->osd_num && !cl->in_osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
        if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
        {
            // Do not run keepalive on regular clients
            continue;
@@ -199,7 +199,7 @@ void osd_messenger_t::init()
            if (!cl->ping_time_remaining)
            {
                // Ping timed out, stop the client
                fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->in_osd_num ? cl->in_osd_num : cl->osd_num, cl->peer_fd);
                fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
                stop_client(peer_fd, true);
                // Restart iterator because it may be invalidated
                cl_it = clients.upper_bound(peer_fd);
@@ -230,7 +230,7 @@ void osd_messenger_t::init()
                return;
            }
            int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
            auto fail_osd_num = cl->in_osd_num ? cl->in_osd_num : cl->osd_num;
            auto fail_osd_num = cl->osd_num;
            cl->ping_time_remaining = 0;
            delete op;
            if (fail_fd >= 0)

@@ -60,7 +60,6 @@ struct osd_client_t
    int ping_time_remaining = 0;
    int idle_time_remaining = 0;
    osd_num_t osd_num = 0;
    osd_num_t in_osd_num = 0;
    bool is_incoming = false;

    void *in_buf = NULL;
@@ -99,7 +98,6 @@ struct osd_client_t
    std::vector<osd_op_t*> zc_free_list;

    ~osd_client_t();
    void cancel_ops();
};

struct osd_wanted_peer_t
@@ -237,7 +235,6 @@ public:
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;
    std::function<void(osd_num_t)> repeer_pgs;
    std::function<void(osd_num_t)> break_pg_locks;
    std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
    void read_requests();
    void send_replies();

@@ -173,7 +173,6 @@ struct osd_op_t
    osd_op_buf_list_t iov;

    ~osd_op_t();
    void cancel();

    bool is_recovery_related();
};

@@ -510,12 +510,13 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
    rc->qp = conn->cmid->qp;
    // And an osd_client_t
    auto cl = new osd_client_t();
    cl->is_incoming = true;
    cl->peer_addr = conn->parsed_addr;
    cl->peer_port = conn->rdmacm_port;
    cl->peer_fd = conn->peer_fd;
    cl->peer_state = PEER_RDMA;
    cl->connect_timeout_id = -1;
    cl->in_osd_num = peer_osd;
    cl->osd_num = peer_osd;
    cl->in_buf = malloc_or_die(receive_buffer_size);
    cl->rdma_conn = rc;
    clients[conn->peer_fd] = cl;

@@ -8,12 +8,11 @@ void osd_messenger_t::read_requests()
    for (int i = 0; i < read_ready_clients.size(); i++)
    {
        int peer_fd = read_ready_clients[i];
        auto cl_it = clients.find(peer_fd);
        if (cl_it == clients.end() || !cl_it->second || cl_it->second->read_msg.msg_iovlen)
        osd_client_t *cl = clients[peer_fd];
        if (cl->read_msg.msg_iovlen)
        {
            continue;
        }
        auto cl = cl_it->second;
        if (cl->read_remaining < receive_buffer_size)
        {
            cl->read_iov.iov_base = cl->in_buf;
@@ -34,12 +33,8 @@ void osd_messenger_t::read_requests()
        auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
        io_uring_sqe sqe_local;
        ring_data_t data_local;
        sqe_local.user_data = (uint64_t)&data_local;
        io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
        if (iothread)
        {
            sqe_local = { .user_data = (uint64_t)&data_local };
            data_local = {};
        }
        if (!sqe)
        {
            cl->read_msg.msg_iovlen = 0;
@@ -61,8 +56,7 @@ void osd_messenger_t::read_requests()
            {
                result = -errno;
            }
            // like set_immediate
            tfd->set_timer_us(0, false, [this, result, cl](int){ handle_read(result, cl); });
            handle_read(result, cl);
        }
    }
    read_ready_clients.clear();
@@ -234,7 +228,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
    {
        if (cl->read_op->req.hdr.id != cl->read_op_id)
        {
            fprintf(stderr, "Warning: operation sequencing is broken on client %d: expected num %ju, got %ju, stopping client\n", cl->peer_fd, cl->read_op_id, cl->read_op->req.hdr.id);
            fprintf(stderr, "Warning: operation sequencing is broken on client %d, stopping client\n", cl->peer_fd);
            stop_client(cl->peer_fd);
            return false;
        }

@@ -194,14 +194,12 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
    auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
    io_uring_sqe sqe_local;
    ring_data_t data_local;
    sqe_local.user_data = (uint64_t)&data_local;
    io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
    if (iothread)
    {
        sqe_local = { .user_data = (uint64_t)&data_local };
        data_local = {};
    }
    if (!sqe)
    {
        return false;
    }
    cl->write_msg.msg_iov = cl->send_list.data();
    cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
    cl->refs++;
@@ -239,8 +237,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
        {
            result = -errno;
        }
        // like set_immediate
        tfd->set_timer_us(0, false, [this, result, cl](int){ handle_send(result, false, false, cl); });
        handle_send(result, false, false, cl);
    }
    return true;
}

@@ -9,37 +9,38 @@
#include "msgr_rdma.h"
#endif

void osd_client_t::cancel_ops()
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    std::vector<osd_op_t*> cancel_ops;
    cancel_ops.resize(sent_ops.size());
    cancel_ops.resize(cl->sent_ops.size());
    int i = 0;
    for (auto p: sent_ops)
    for (auto p: cl->sent_ops)
    {
        cancel_ops[i++] = p.second;
    }
    sent_ops.clear();
    cl->sent_ops.clear();
    cl->outbox.clear();
    for (auto op: cancel_ops)
    {
        op->cancel();
        cancel_op(op);
    }
}

void osd_op_t::cancel()
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op_type == OSD_OP_OUT && callback)
    if (op->op_type == OSD_OP_OUT)
    {
        reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
        reply.hdr.id = req.hdr.id;
        reply.hdr.opcode = req.hdr.opcode;
        reply.hdr.retval = -EPIPE;
        // Copy lambda to be unaffected by `delete this`
        (std::function<void(osd_op_t*)>(callback))(this);
        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
        op->reply.hdr.id = op->req.hdr.id;
        op->reply.hdr.opcode = op->req.hdr.opcode;
        op->reply.hdr.retval = -EPIPE;
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(op->callback)(op);
    }
    else
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete this;
        delete op;
    }
}

@@ -62,10 +63,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (OSD peer %ju)\n", osd_num, peer_fd, cl->osd_num);
    }
    else if (cl->in_osd_num)
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (incoming OSD peer %ju)\n", osd_num, peer_fd, cl->in_osd_num);
    }
    else
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (regular client)\n", osd_num, peer_fd);
@@ -76,12 +73,8 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
    cl->peer_state = PEER_STOPPED;
    if (cl->osd_num)
    {
        auto osd_it = osd_peer_fds.find(cl->osd_num);
        if (osd_it != osd_peer_fds.end() && osd_it->second == cl->peer_fd)
        {
            // ...and forget OSD peer
            osd_peer_fds.erase(osd_it);
        }
        // ...and forget OSD peer
        osd_peer_fds.erase(cl->osd_num);
    }
#ifndef __MOCK__
    // Then remove FD from the eventloop so we don't accidentally read something
@@ -108,17 +101,30 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
        }
    }
#endif
    if (cl->in_osd_num && break_pg_locks)
    {
        // Break PG locks
        break_pg_locks(cl->in_osd_num);
    }
    if (cl->osd_num)
    {
        // Then repeer PGs because cancel_op() callbacks can try to perform
        // some actions and we need correct PG states to not do something silly
        repeer_pgs(cl->osd_num);
    }
    // Then cancel all operations
    if (cl->read_op)
    {
        if (!cl->read_op->callback)
        {
            delete cl->read_op;
        }
        else
        {
            cancel_op(cl->read_op);
        }
        cl->read_op = NULL;
    }
    if (cl->osd_num)
    {
        // Cancel outbound operations
        cancel_osd_ops(cl);
    }
    // Find the item again because it can be invalidated at this point
    it = clients.find(peer_fd);
    if (it != clients.end())
@@ -143,17 +149,6 @@ osd_client_t::~osd_client_t()
        close(peer_fd);
        peer_fd = -1;
    }
    // Then cancel all operations
    // Operations have to be canceled only after clearing all references to osd_client_t
    // because otherwise their buffers may be still present in io_uring asynchronous requests
    if (read_op)
    {
        // read_op may be an incoming op or a continued response for an outbound op
        read_op->cancel();
        read_op = NULL;
    }
    // Cancel outbound ops
    cancel_ops();
#ifndef __MOCK__
#ifdef WITH_RDMA
    if (rdma_conn)

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
Version: 2.2.2
Version: 2.2.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

@@ -70,7 +70,7 @@ struct rm_osd_t
        {
            if (parent->cli->st_cli.peer_states.find(osd_id) != parent->cli->st_cli.peer_states.end())
            {
                is_warning = !allow_up;
                is_warning = true;
                still_up.push_back(osd_id);
            }
        }

@@ -85,7 +85,6 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
    msgr.ringloop = this->ringloop;
    msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
    msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
    msgr.break_pg_locks = [this](osd_num_t peer_osd) { break_pg_locks(peer_osd); };
    msgr.check_config_hook = [this](osd_client_t *cl, json11::Json conf) { return check_peer_config(cl, conf); };
    msgr.init();

@@ -278,8 +278,6 @@ class osd_t
    void handle_peers();
    bool check_peer_config(osd_client_t *cl, json11::Json conf);
    void repeer_pgs(osd_num_t osd_num);
    void repeer_pg(pg_t & pg);
    void break_pg_locks(osd_num_t osd_num);
    void start_pg_peering(pg_t & pg);
    void drop_dirty_pg_connections(pool_pg_num_t pg);
    void record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state);

@@ -432,16 +432,9 @@ void osd_t::apply_pg_locks_localize_only()
        }
        auto & pool_cfg = pool_it->second;
        auto & pg = pp.second;
        auto old_disable_pg_locks = pg.disable_pg_locks;
        pg.disable_pg_locks = pg_locks_localize_only &&
            (pool_cfg.scheme != POOL_SCHEME_REPLICATED ||
            pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY);
        if (!pg.disable_pg_locks && old_disable_pg_locks)
        {
            // Relock PG
            printf("[PG %u/%u] Repeer to enable PG locks\n", pg.pool_id, pg.pg_num);
            repeer_pg(pg);
        }
            pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
            pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY;
    }
}

@@ -884,8 +877,8 @@ void osd_t::apply_pg_config()
            pg.next_scrub = pg_cfg.next_scrub;
            pg.target_set = pg_cfg.target_set;
            pg.disable_pg_locks = pg_locks_localize_only &&
                (pool_item.second.scheme != POOL_SCHEME_REPLICATED ||
                pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY);
                pool_item.second.scheme == POOL_SCHEME_REPLICATED &&
                pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY;
            if (pg.scheme == POOL_SCHEME_EC)
            {
                use_ec(pg.pg_size, pg.pg_data_size, true);
@@ -1051,15 +1044,8 @@ void osd_t::report_pg_states()
    etcd_reporting_pg_state = true;
    st_cli.etcd_txn(json11::Json::object {
        { "compare", checks }, { "success", success }, { "failure", failure }
    }, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs, success_count = success.size(), failure_count = failure.size()](std::string err, json11::Json data)
    }, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs](std::string err, json11::Json data)
    {
        int expected_count = (data["succeeded"].bool_value() ? success_count : failure_count);
        if (expected_count != data["responses"].array_items().size())
        {
            printf("Unexpected response from etcd - 'responses' count (%u) isn't equal to expected (%u), stopping\n",
                data["responses"].array_items().size(), expected_count);
            force_stop(1);
        }
        etcd_reporting_pg_state = false;
        if (!data["succeeded"].bool_value())
        {

@@ -73,25 +73,18 @@ void osd_t::handle_peers()
    }
}

void osd_t::break_pg_locks(osd_num_t peer_osd)
{
    for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
    {
        if (lock_it->second.primary_osd == peer_osd)
        {
            if (log_level > 3)
            {
                printf("Break PG %u/%u lock on disconnection of OSD %ju\n", lock_it->first.pool_id, lock_it->first.pg_num, peer_osd);
            }
            pg_locks.erase(lock_it++);
        }
        else
            lock_it++;
    }
}

void osd_t::repeer_pgs(osd_num_t peer_osd)
{
    if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
    {
        for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
        {
            if (lock_it->second.primary_osd == peer_osd)
                pg_locks.erase(lock_it++);
            else
                lock_it++;
        }
    }
    // Re-peer affected PGs
    for (auto & p: pgs)
    {
@@ -111,26 +104,21 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
            {
                // Repeer this pg
                printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
                repeer_pg(pg);
                if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
                {
                    start_pg_peering(pg);
                }
                else
                {
                    // Stop accepting new operations, wait for current ones to finish or fail
                    pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
                    report_pg_state(pg);
                }
            }
        }
    }
}

void osd_t::repeer_pg(pg_t & pg)
{
    if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
    {
        start_pg_peering(pg);
    }
    else
    {
        // Stop accepting new operations, wait for current ones to finish or fail
        pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
        report_pg_state(pg);
    }
}

// Reset PG state (when peering or stopping)
void osd_t::reset_pg(pg_t & pg)
{
@@ -478,7 +466,6 @@ void osd_t::relock_pg(pg_t & pg)
    auto pg_it = pgs.find(pg_id);
    if (pg_it == pgs.end())
    {
        printf("Warning: PG %u/%u is gone during lock attempt\n", pg_id.pool_id, pg_id.pg_num);
        return;
    }
    auto & pg = pg_it->second;

@@ -417,17 +417,15 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
    if (retval != expected)
    {
        int64_t peer_osd = (msgr.clients.find(subop->peer_fd) != msgr.clients.end()
            ? msgr.clients[subop->peer_fd]->osd_num : 0);
            ? msgr.clients[subop->peer_fd]->osd_num : -subop->peer_fd);
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            printf(
                subop->peer_fd >= 0
                ? (peer_osd > 0
                    ? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$ju: retval = %5$d (expected %6$d)\n"
                    : "%1$s subop to %2$jx:%3$jx v%4$ju failed on peer %8$d: retval = %5$d (expected %6$d)\n")
                ? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$jd: retval = %5$d (expected %6$d)\n"
                : "%1$s subop to %2$jx:%3$jx v%4$ju failed locally: retval = %5$d (expected %6$d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
                retval, expected, peer_osd, subop->peer_fd
                retval, expected, peer_osd
            );
        }
        else if (opcode == OSD_OP_SEC_DELETE)

@@ -91,17 +91,16 @@ bool osd_t::sec_check_pg_lock(osd_num_t primary_osd, const object_id &oid)
    {
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
    if (pg_locks_localize_only && (pool_cfg.scheme != POOL_SCHEME_REPLICATED || pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY))
    {
        return true;
    }
    auto ppg = (pool_pg_num_t){ .pool_id = pool_id, .pg_num = map_to_pg(oid, pool_cfg_it->second.pg_stripe_size) };
    auto pg_it = pgs.find(ppg);
    if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE)
    {
        return false;
    }
    if (pg_it->second.disable_pg_locks)
    {
        return true;
    }
    auto lock_it = pg_locks.find(ppg);
    return lock_it != pg_locks.end() && lock_it->second.primary_osd == primary_osd;
}
@@ -141,7 +140,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
    {
        if (!(cur_op->req.sec_rw.flags & OSD_OP_IGNORE_PG_LOCK) &&
            !sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_rw.oid))
            !sec_check_pg_lock(cl->osd_num, cur_op->req.sec_rw.oid))
        {
            cur_op->bs_op->retval = -EPIPE;
            secondary_op_callback(cur_op);
@@ -170,7 +169,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
    else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
    {
        if (!(cur_op->req.sec_del.flags & OSD_OP_IGNORE_PG_LOCK) &&
            !sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_del.oid))
            !sec_check_pg_lock(cl->osd_num, cur_op->req.sec_del.oid))
        {
            cur_op->bs_op->retval = -EPIPE;
            secondary_op_callback(cur_op);
@@ -194,7 +193,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
    {
        for (int i = 0; i < cur_op->bs_op->len; i++)
        {
            if (!sec_check_pg_lock(cl->in_osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
            if (!sec_check_pg_lock(cl->osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
            {
                cur_op->bs_op->retval = -EPIPE;
                secondary_op_callback(cur_op);
@@ -248,7 +247,7 @@ void osd_t::exec_sec_read_bmp(osd_op_t *cur_op)
    void *cur_buf = reply_buf;
    for (int i = 0; i < n; i++)
    {
        if (!sec_check_pg_lock(cl->in_osd_num, ov[i].oid) &&
        if (!sec_check_pg_lock(cl->osd_num, ov[i].oid) &&
            !(cur_op->req.sec_read_bmp.flags & OSD_OP_IGNORE_PG_LOCK))
        {
            free(reply_buf);
@@ -270,7 +269,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
{
    cur_op->reply.sec_lock.cur_primary = 0;
    auto cl = msgr.clients.at(cur_op->peer_fd);
    if (!cl->in_osd_num ||
    if (!cl->osd_num ||
        cur_op->req.sec_lock.flags != OSD_SEC_LOCK_PG &&
        cur_op->req.sec_lock.flags != OSD_SEC_UNLOCK_PG ||
        cur_op->req.sec_lock.pool_id > ((uint64_t)1<<POOL_ID_BITS) ||
@@ -291,7 +290,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
    auto lock_it = pg_locks.find(ppg);
    if (cur_op->req.sec_lock.flags == OSD_SEC_LOCK_PG)
    {
        if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->in_osd_num)
        if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->osd_num)
        {
            cur_op->reply.sec_lock.cur_primary = lock_it->second.primary_osd;
            finish_op(cur_op, -EBUSY);
@@ -304,21 +303,13 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
            finish_op(cur_op, -EBUSY);
            return;
        }
        if (log_level > 3)
        {
            printf("Lock PG %u/%u for OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
        }
        pg_locks[ppg] = (osd_pg_lock_t){
            .primary_osd = cl->in_osd_num,
            .primary_osd = cl->osd_num,
            .state = cur_op->req.sec_lock.pg_state,
        };
    }
    else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->in_osd_num)
    else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->osd_num)
    {
        if (log_level > 3)
        {
            printf("Unlock PG %u/%u by OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
        }
        pg_locks.erase(lock_it);
    }
    finish_op(cur_op, 0);
@@ -332,7 +323,7 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
        : json11::Json();
    auto peer_osd_num = req_json["osd_num"].uint64_value();
    auto cl = msgr.clients.at(cur_op->peer_fd);
    cl->in_osd_num = peer_osd_num;
    cl->osd_num = peer_osd_num;
    if (req_json["features"]["check_sequencing"].bool_value())
    {
        cl->check_sequencing = true;

@@ -121,7 +121,6 @@ void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
    cli->msgr.osd_peer_fds[osd_num] = peer_fd;
    cli->msgr.clients[peer_fd] = new osd_client_t();
    cli->msgr.clients[peer_fd]->osd_num = osd_num;
    cli->msgr.clients[peer_fd]->peer_fd = peer_fd;
    cli->msgr.clients[peer_fd]->peer_state = PEER_CONNECTED;
    cli->msgr.wanted_peers.erase(osd_num);
    cli->msgr.repeer_pgs(osd_num);

@@ -125,8 +125,6 @@ void ring_loop_t::loop()
        if (cqe->flags & IORING_CQE_F_MORE)
        {
            // There will be a second notification
            if (mt)
                mu.unlock();
            d->res = cqe->res;
            d->more = true;
            if (d->callback)

@@ -59,7 +59,6 @@ SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh

./test_write.sh
SCHEME=xor ./test_write.sh
TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' ./test_write.sh

./test_write_no_same.sh