Compare commits

8 Commits:
- f201ecdd51
- 4afb617f59
- d3fde0569f
- 438b64f6c3
- 2b0a802ea1
- 0dd49c1d67
- 410170db96
- 7d8523e0e5

Changed paths: .gitea/workflows, docs/intro, src, tests
@@ -684,6 +684,24 @@ jobs:
           echo ""
         done
 
+  test_write_iothreads:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
   test_write_no_same:
     runs-on: ubuntu-latest
     needs: build

@@ -1 +1 @@
-Subproject commit a21350e484cefa5728f23c227323b4b0822e738f
+Subproject commit 8de8b467acbca50cfd8835c20e0e379110f3b32b
@@ -10,8 +10,17 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
 
 Join Vitastor Telegram Chat: https://t.me/vitastor
 
-All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
+License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.
+
+Server-side code is licensed only under the terms of VNPL.
+
+Client libraries (cluster_client and so on) are dual-licensed under the same
+VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+software like QEMU and fio.
+
+## VNPL
+
+Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
 GNU GPLv3.0 with the additional "Network Interaction" clause which requires
 opensourcing all programs directly or indirectly interacting with Vitastor
 through a computer network and expressly designed to be used in conjunction

@@ -20,18 +29,83 @@ the terms of the same license, but also under the terms of any GPL-Compatible
 Free Software License, as listed by the Free Software Foundation.
 This is a stricter copyleft license than the Affero GPL.
 
-Please note that VNPL doesn't require you to open the code of proprietary
-software running inside a VM if it's not specially designed to be used with
-Vitastor.
+The idea of VNPL is, in addition to modules linked to Vitastor code in a single
+binary file, to extend copyleft action to micro-service modules only interacting
+with it over the network.
 
 Basically, you can't use the software in a proprietary environment to provide
 its functionality to users without opensourcing all intermediary components
 standing between the user and Vitastor or purchasing a commercial license
 from the author 😀.
 
-Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
-software like QEMU and fio.
-
-You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
-GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).
+At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
+to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
+
+## Explanation
+
+Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.
+
+A program is considered to be a "Proxy Program" if it meets both conditions:
+- It is specially designed to be used with Vitastor. Basically, it means that the program
+  has some functionality specific to Vitastor and thus "knows" that it works with Vitastor,
+  not with something random.
+- It interacts with Vitastor directly or indirectly through any programming interface,
+  including API, CLI, network or any wrapper (also considered a Proxy Program itself).
+
+If, in addition to that:
+- You give any user an opportunity to interact with Vitastor directly or indirectly through
+  any computer interface including the network or any number of wrappers (Proxy Programs).
+
+Then VNPL requires you to publish the code of all such Proxy Programs to all such users
+under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2 -
+because "GPL compatibility" is treated as the ability to legally include licensed code in
+a GPL application.
+
+So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
+interacts with Vitastor, you are forbidden to use Vitastor under the terms of VNPL and you
+need a commercial license which doesn't contain open-source requirements.
+
+## Examples
+
+- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
+  - Yes, it interacts with Vitastor through vitastor-cli.
+  - Yes, it is designed specially for use with Vitastor (it makes no sense otherwise).
+  - So, the CSI driver **definitely IS** a Proxy Program and must be published under the terms of
+    a free software license.
+- Windows, installed in a VM with the system disk on Vitastor storage.
+  - Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
+    device interface, emulated by QEMU.
+  - No, it definitely isn't designed specially for use with Vitastor - Windows was created long
+    before Vitastor and doesn't know anything about it.
+  - So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require opening it.
+- Cloud control panel which makes requests to the Vitastor Kubernetes CSI driver.
+  - Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
+  - It may or may not be designed specially for use with Vitastor. How to determine exactly?
+    Imagine that Vitastor is replaced with any other storage (for example, with a proprietary one).
+    Do control panel functions change in any way? If they do (for example, if snapshots stop working),
+    then the panel contains specific functionality and thus is designed specially for use with Vitastor.
+    Otherwise, the panel is universal and isn't designed specially for Vitastor.
+  - So, whether you are required to open-source the panel also **depends** on whether it
+    contains specific functionality or not.
+
+## Why?
+
+Because I believe in the spirit of copyleft (Linux wouldn't have become so popular without GPL!)
+and, at the same time, I want to have a way to monetize the product.
+
+Existing licenses including AGPL are useless for this with an SDS - an SDS is very deeply
+internal software which is almost always invisible to the user, and thus AGPL doesn't
+require anyone to open the code even if they make a proprietary fork.
+
+And, in fact, the current situation in the world, where GPL is thought to only restrict direct
+linking of programs into a single executable file, isn't quite correct. Nowadays, programs
+are more often linked with network API calls, not with /usr/bin/ld, and a software product
+may consist of dozens of microservices interacting with each other over the network.
+
+That's why we need VNPL to keep the license sufficiently copyleft.
+
+## License Texts
+
+- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
+- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
+- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
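The "Explanation" section added above is essentially a two-step test. The following is an informal, self-contained sketch of that decision rule for illustration only - the function and field names are hypothetical, it is not part of Vitastor, and it is not legal advice:

```cpp
#include <cstdio>

// Informal restatement of the VNPL "Proxy Program" test described above.
struct program_facts_t
{
    bool specially_designed_for_vitastor; // has Vitastor-specific functionality
    bool interacts_with_vitastor;         // via API, CLI, network or any wrapper
    bool exposed_to_users;                // users can reach Vitastor through it
};

// Returns true when VNPL requires publishing the program's code to those users
// (or, alternatively, obtaining a commercial license).
static bool vnpl_requires_publication(const program_facts_t & p)
{
    bool is_proxy_program = p.specially_designed_for_vitastor && p.interacts_with_vitastor;
    return is_proxy_program && p.exposed_to_users;
}

int main()
{
    program_facts_t csi_driver{true, true, true};  // the CSI driver example above
    program_facts_t windows_vm{false, true, true}; // the Windows-in-a-VM example above
    printf("CSI driver: %d, Windows VM: %d\n",
        vnpl_requires_publication(csi_driver), vnpl_requires_publication(windows_vm));
}
```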
@@ -12,6 +12,14 @@
 
 License: VNPL 1.1 for the server-side code and dual VNPL 1.1 + GPL 2.0+ for the client code.
 
+Server-side components are distributed only under the terms of VNPL.
+
+Client libraries are distributed under a dual license: the same VNPL 1.0
+and also GNU GPL 2.0 or any later version. This is done for compatibility
+with software such as QEMU and fio.
+
+## VNPL
+
 VNPL - "network copyleft" - is our own free copyleft license,
 Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
 "Network Interaction" clause which requires distributing all programs

@@ -29,9 +37,70 @@ Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
 No restrictions are imposed on Windows or any other software that was not developed *specially*
 for use together with Vitastor.
 
-Client libraries are distributed under a dual license: the same VNPL 1.0
-and also GNU GPL 2.0 or any later version. This is done for compatibility
-with software such as QEMU and fio.
-
-You can find the full text of VNPL 1.1 in English in the file [VNPL-1.1.txt](../../VNPL-1.1.txt),
-VNPL 1.1 in Russian in the file [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), and GPL 2.0 in the file [GPL-2.0.txt](../../GPL-2.0.txt).
+## Explanation
+
+Network copyleft is governed by the license clause **13. Remote Network Interaction**.
+
+A program is considered a "Proxy Program" if both conditions hold:
+- It is created specially to work together with Vitastor. In essence this means that the program
+  must have Vitastor-specific functionality, i.e. "know" that it interacts
+  specifically with Vitastor.
+- It interacts with Vitastor directly or indirectly through absolutely any programming
+  interface, including any way of calling it: API, CLI, network, or some wrapper (which
+  in turn is also a Proxy Program).
+
+If, in addition to that:
+- You give any user the ability to interact with Vitastor over the network,
+  again, through any interface or any chain of "wrappers" (Proxy Programs),
+
+then, according to VNPL, you must open the code of the "Proxy Programs" **to those users** under
+the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2 - where
+"GPL compatibility" is understood as the ability to include the licensed code in a GPL application.
+
+Accordingly, if you have a "Proxy Program" whose code is not open to the user
+who directly or indirectly interacts with Vitastor, you are forbidden to use Vitastor
+under the terms of VNPL and you need a commercial license which doesn't contain open-source requirements.
+
+## Examples
+
+- The Vitastor Kubernetes CSI driver, which creates PersistentVolumes by calling `vitastor-cli create`.
+  - Yes, it interacts with Vitastor through vitastor-cli.
+  - Yes, it was created specially to work with Vitastor (what would be the point of it otherwise).
+  - So, the CSI driver **definitely counts** as a "Proxy Program" and must be opened under a free
+    license.
+- Windows, installed in a virtual machine on a Vitastor disk.
+  - Yes, it interacts with Vitastor "directly or indirectly" - it reads and writes data through the block
+    device interface emulated by QEMU.
+  - No, it definitely wasn't created *specially to work with Vitastor* - when it was created, there
+    was no Vitastor at all.
+  - So, Windows **definitely does not count** as a "Proxy Program" and the VNPL requirements don't apply to it.
+- A cloud control panel which makes requests to the Vitastor Kubernetes CSI driver.
+  - Yes, it interacts with Vitastor indirectly through the CSI driver, which is a "Proxy Program".
+  - It's not immediately clear whether it was created specifically to work with Vitastor. How to tell?
+    Imagine that Vitastor is replaced with any other storage system (for example, a proprietary one).
+    Does the control panel's behaviour change? If it does (for example, snapshots stop working), then
+    the panel contains specific functionality and was "created specially to work with Vitastor".
+    If not, the panel contains no specific functionality and is in principle universal.
+  - Whether the panel has to be opened therefore **depends** on whether it contains specific functionality or not.
+
+## Why so?
+
+Because I simultaneously believe in the spirit of copyleft licenses (Linux wouldn't have become so popular
+if not for GPL!) and want to be able to monetize the product.
+
+At the same time, using even AGPL for a software-defined storage system is pointless - it is deeply internal
+software which the user will almost certainly never see at all, so nobody would ever
+have to open the code, even when building a derived product.
+
+And in general, the situation that has developed in the world, in which the effect of GPL is limited only
+to direct linking into a single executable file, is not quite correct. Nowadays programs
+are much more often integrated with network calls rather than with /usr/bin/ld, and a whole software
+product may consist of several dozen microservices interacting over the network.
+
+That's why VNPL was invented - to keep the license sufficiently copyleft.
+
+## License Texts
+
+- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
+- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
+- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
@@ -31,7 +31,6 @@
 #define DEFAULT_DATA_BLOCK_ORDER 17
 #define MIN_DATA_BLOCK_SIZE 4*1024
 #define MAX_DATA_BLOCK_SIZE 128*1024*1024
-#define MAX_META_BLOCK_SIZE 64*1024
 #define DEFAULT_BITMAP_GRANULARITY 4096
 
 #define BS_OP_MIN 1

@@ -127,9 +127,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
     {
         throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
     }
-    else if (meta_block_size > MAX_META_BLOCK_SIZE)
+    else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
     {
-        throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_META_BLOCK_SIZE));
+        throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
     }
     if (data_offset % disk_alignment)
     {

@@ -427,6 +427,13 @@ stop_flusher:
     printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
 #endif
     flusher->active_flushers++;
+    // Find it in clean_db
+    {
+        auto & clean_db = bs->clean_db_shard(cur.oid);
+        auto clean_it = clean_db.find(cur.oid);
+        old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
+        old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
+    }
     // Scan dirty versions of the object to determine what we need to read
     scan_dirty();
     // Writes and deletes shouldn't happen at the same time

@@ -531,7 +538,7 @@ resume_2:
     {
         // zero out old metadata entry
         {
-            clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos);
+            clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
             if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
             {
                 printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",

@@ -540,7 +547,7 @@ resume_2:
                 exit(1);
             }
         }
-        memset((uint8_t*)meta_old.buf + meta_old.pos, 0, bs->dsk.clean_entry_size);
+        memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
 resume_20:
         if (meta_old.sector != meta_new.sector && !write_meta_block(meta_old, 20))
             return false;

@@ -601,7 +608,7 @@ resume_2:
 
 void journal_flusher_co::update_metadata_entry()
 {
-    clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
+    clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
     if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
     {
         printf(

@@ -616,7 +623,7 @@ void journal_flusher_co::update_metadata_entry()
     if (has_delete)
     {
         // Zero out the new metadata entry
-        memset((uint8_t*)meta_new.buf + meta_new.pos, 0, bs->dsk.clean_entry_size);
+        memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
     }
     else
     {

@@ -798,7 +805,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
         }
     }
     {
-        clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
+        clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
         if (new_entry->oid != cur.oid)
         {
             printf(

@@ -905,12 +912,6 @@ void journal_flusher_co::calc_block_checksums(uint32_t *new_data_csums, bool ski
 
 void journal_flusher_co::scan_dirty()
 {
-    // Find it in clean_db
-    auto & clean_db = bs->clean_db_shard(cur.oid);
-    auto clean_it = clean_db.find(cur.oid);
-    old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
-    old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
-    auto old_clean_bitmap = (clean_it != clean_db.end() ? bs->get_clean_entry_bitmap(clean_it, 0) : NULL);
     dirty_it = dirty_start = dirty_end;
     v.clear();
     copy_count = 0;

@@ -1036,12 +1037,13 @@ void journal_flusher_co::scan_dirty()
             read_to_fill_incomplete = 0;
             return;
         }
+        uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0);
         uint64_t fulfilled = 0;
         int last = v.size()-1;
         while (last >= 0 && (v[last].copy_flags & COPY_BUF_CSUM_FILL))
             last--;
         read_to_fill_incomplete = bs->fill_partial_checksum_blocks(
-            v, fulfilled, old_clean_bitmap, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
+            v, fulfilled, bmp_ptr, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
             ((v[last].offset+v[last].len-1) / bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size
         );
     }

@@ -1137,7 +1139,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
 resume_0:
     if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
         return false;
-    new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos + sizeof(clean_disk_entry);
+    new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
     if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
     {
 resume_1:

@@ -1191,7 +1193,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
     // so I'll avoid it as long as I can.
     wr.submitted = false;
     wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
-    wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.clean_entry_size;
+    wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
     if (bs->inmemory_meta)
     {
         wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
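Several hunks above change the meaning of meta_old.pos / meta_new.pos / wr.pos from a byte offset inside a metadata block to an entry index, which is why every use site now multiplies by bs->dsk.clean_entry_size. A small self-contained sketch of the arithmetic, using the default sizes mentioned in the header comments (simplified, hypothetical names - not the actual vitastor code):

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t block_order = 17;       // 128 KiB data blocks (DEFAULT_DATA_BLOCK_ORDER)
    const uint64_t meta_block_size = 4096; // one metadata block
    const uint64_t clean_entry_size = 32;  // bytes per clean entry (no checksums)
    const uint64_t meta_loc = uint64_t(5) << block_order; // location of data block #5

    uint64_t entry_num = meta_loc >> block_order;            // entry number over the meta area
    uint64_t per_block = meta_block_size / clean_entry_size; // entries per metadata block
    uint64_t sector = (entry_num / per_block) * meta_block_size; // wr.sector
    uint64_t pos = entry_num % per_block;                        // wr.pos: index, not bytes
    uint64_t byte_offset = pos * clean_entry_size;               // computed at the use site now

    printf("sector=%llu pos=%llu byte_offset=%llu\n",
        (unsigned long long)sector, (unsigned long long)pos, (unsigned long long)byte_offset);
}
```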
@@ -42,8 +42,6 @@ blockstore_impl_t::~blockstore_impl_t()
     free(metadata_buffer);
     if (clean_bitmaps)
         free(clean_bitmaps);
-    if (heap_meta.blocks)
-        delete[] heap_meta.blocks;
 }
 
 bool blockstore_impl_t::is_started()

@@ -433,29 +431,13 @@ blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
 {
     uint64_t pg_num = 0;
     uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
-    auto sett_it = clean_db_settings.find(pool_id);
-    if (sett_it != clean_db_settings.end())
+    auto sh_it = clean_db_settings.find(pool_id);
+    if (sh_it != clean_db_settings.end())
     {
         // like map_to_pg()
-        pg_num = (oid.stripe / sett_it->second.pg_stripe_size) % sett_it->second.pg_count + 1;
+        pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
     }
-    auto shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
-    if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
-    {
-        auto sh_it = clean_db_shards.find(shard_id);
-        if (sh_it == clean_db_shards.end())
-        {
-            // clean_db_t stores larger entries with heap_meta, but we disguise it as smaller clean_entry :)
-            // patched cpp-btree with extra_data
-            clean_db_shards[shard_id] = blockstore_clean_db_t(
-                sizeof(clean_entry_heap_t) - sizeof(clean_entry)
-                + (inmemory_meta ? dsk.clean_dyn_size : 2*dsk.clean_entry_bitmap_size)
-            );
-            return clean_db_shards[shard_id];
-        }
-        return sh_it->second;
-    }
-    return clean_db_shards[shard_id];
+    return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
 }
 
 void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)

@@ -96,9 +96,6 @@
 #define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
 #define BLOCKSTORE_META_FORMAT_V1 1
 #define BLOCKSTORE_META_FORMAT_V2 2
-#define BLOCKSTORE_META_FORMAT_HEAP 3
-#define BLOCKSTORE_META_HEADER_V1_SIZE 36
-#define BLOCKSTORE_META_HEADER_V2_SIZE 48
 
 // metadata header (superblock)
 struct __attribute__((__packed__)) blockstore_meta_header_v1_t

@@ -122,7 +119,6 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
     uint32_t data_csum_type;
     uint32_t csum_block_size;
     uint32_t header_csum;
-    uint32_t block_id_bits; // 32 by default in heap meta
 };
 
 // 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)

@@ -144,62 +140,6 @@ struct __attribute__((__packed__)) clean_entry
     uint64_t location;
 };
 
-typedef uint32_t heap_block_num_t;
-
-// 50 = 16 (key=object_id) + 26 (value) + 8 (bitmap) + N (checksum) bytes per "clean" entry in memory
-struct __attribute__((__packed__)) clean_entry_heap_t
-{
-    uint64_t version;
-    uint64_t location; // UINT64_MAX = deleted
-    // previous versions invalidated by this version
-    heap_block_num_t prev_versions;
-    // metadata block number
-    heap_block_num_t meta_block;
-    // offset within block
-    uint16_t block_offset;
-    uint8_t bitmap[];
-};
-
-struct __attribute__((__packed__)) heap_meta_block_header_t
-{
-    uint64_t magic;
-    uint64_t seq_num;
-    uint32_t invalidates_blocks;
-};
-
-// 48+checksums = (40+bitmap)+checksums bytes per on-disk "heap" entry
-// for 128 KB block without checksums, it's 48 bytes - 84 entries per 4 kb metadata block
-// for 128 KB block with 4k checksums, it's 176 bytes - 22 entries per 4 kb metadata block
-// for 1 MB block without checksums, it's 80 bytes - 50 entries per 4 kb metadata block
-// for 1 MB block with 4k checksums, it's 1104 bytes O_o - only 3 entries per 4 kb metadata block
-// for 1 MB block with 32k checksums, it's 176 bytes again
-struct __attribute__((__packed__)) heap_meta_entry_t
-{
-    object_id oid;
-    uint64_t version;
-    uint64_t location; // UINT64_MAX = deleted
-    uint64_t reserved;
-    uint8_t bitmap[];
-};
-
-struct heap_meta_block_t
-{
-    heap_block_num_t offset = 0;
-    uint64_t seq_num = 0;
-    uint32_t used_space = 0;
-    std::vector<uint64_t> invalidates_blocks;
-};
-
-struct heap_meta_t
-{
-    heap_block_num_t block_count = 0;
-    heap_meta_block_t *blocks = NULL;
-    // used space => block number
-    std::multimap<uint32_t, heap_block_num_t> used_space_map;
-    heap_block_num_t cur_written_block = 0;
-    uint8_t *written_block_buf = NULL;
-};
-
 // 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
 struct __attribute__((__packed__)) dirty_entry
 {

@@ -332,8 +272,6 @@ class blockstore_impl_t
 
     struct ring_consumer_t ring_consumer;
 
-    heap_meta_t heap_meta;
-
     std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
     std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
     std::map<uint64_t, int> no_inode_stats;

@@ -379,7 +317,7 @@ class blockstore_impl_t
     void open_data();
     void open_meta();
     void open_journal();
-    uint8_t* get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset);
+    uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
 
     blockstore_clean_db_t& clean_db_shard(object_id oid);
     void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);

@@ -407,9 +345,9 @@ class blockstore_impl_t
         uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
         uint32_t item_state, uint64_t item_version, uint64_t item_location,
         uint64_t journal_sector, uint8_t *csum, int *dyn_data);
-    bool fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
-        uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
-    bool fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it);
+    bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
+        uint8_t *clean_entry_bitmap, int *dyn_data,
+        uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
     int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
         uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
     int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,

@@ -418,7 +356,7 @@ class blockstore_impl_t
     bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
         uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
     bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
-    uint8_t* read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos);
+    uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
     bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
         iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
     bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
@@ -54,7 +54,6 @@ int blockstore_init_meta::loop()
     else if (wait_state == 4) goto resume_4;
     else if (wait_state == 5) goto resume_5;
     else if (wait_state == 6) goto resume_6;
-    else if (wait_state == 7) goto resume_7;
     printf("Reading blockstore metadata\n");
     if (bs->inmemory_meta)
         metadata_buffer = bs->metadata_buffer;

@@ -79,7 +78,6 @@ resume_1:
     if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
     {
         {
-            memset(metadata_buffer, 0, bs->dsk.meta_block_size);
             blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
             hdr->zero = 0;
             hdr->magic = BLOCKSTORE_META_MAGIC_V1;

@@ -87,19 +85,12 @@ resume_1:
             hdr->meta_block_size = bs->dsk.meta_block_size;
             hdr->data_block_size = bs->dsk.data_block_size;
             hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
-            if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_HEAP)
-            {
-                hdr->block_id_bits = sizeof(heap_block_num_t);
-            }
             if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
             {
                 hdr->data_csum_type = bs->dsk.data_csum_type;
                 hdr->csum_block_size = bs->dsk.csum_block_size;
                 hdr->header_csum = 0;
-                hdr->header_csum = crc32c(0, hdr,
-                    bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_V2
-                    ? BLOCKSTORE_META_HEADER_V2_SIZE
-                    : sizeof(*hdr));
+                hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
             }
         }
         if (bs->readonly)

@@ -137,7 +128,7 @@ resume_1:
         );
         exit(1);
     }
-    if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
+    if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
     {
         uint32_t csum = hdr->header_csum;
         hdr->header_csum = 0;

@@ -147,23 +138,6 @@ resume_1:
            exit(1);
         }
         hdr->header_csum = csum;
-        bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_HEAP;
-        if (hdr->block_id_bits != sizeof(heap_block_num_t))
-        {
-            printf("Heap metadata block ID size (%u) is not supported by this build\n", hdr->block_id_bits);
-            exit(1);
-        }
-    }
-    else if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
-    {
-        uint32_t csum = hdr->header_csum;
-        hdr->header_csum = 0;
-        if (crc32c(0, hdr, BLOCKSTORE_META_HEADER_V2_SIZE) != csum)
-        {
-            printf("Metadata header is corrupt (checksum mismatch).\n");
-            exit(1);
-        }
-        hdr->header_csum = csum;
         if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
         {
             bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;

@@ -186,11 +160,11 @@ resume_1:
             printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
         }
     }
-    else
+    else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
     {
         printf(
             "Metadata format is too new for me (stored version is %ju, max supported %u).\n",
-            hdr->version, BLOCKSTORE_META_FORMAT_HEAP
+            hdr->version, BLOCKSTORE_META_FORMAT_V2
         );
         exit(1);
     }

@@ -215,12 +189,7 @@ resume_1:
     // Skip superblock
     md_offset = bs->dsk.meta_block_size;
     next_offset = md_offset;
-    entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size; // FIXME only array
-    if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
-    {
-        bs->heap_meta.blocks = new heap_meta_block_t[bs->dsk.meta_len / bs->dsk.meta_block_size];
-        bs->heap_meta.block_count = bs->dsk.meta_len / bs->dsk.meta_block_size;
-    }
+    entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
     // Read the rest of the metadata
 resume_2:
     if (next_offset < bs->dsk.meta_len && submitted == 0)

@@ -264,10 +233,9 @@ resume_2:
             bool changed = false;
             for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
             {
-                auto this_changed = bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP
-                    ? handle_heap_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset)
-                    : handle_array_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset);
-                if (this_changed)
+                // handle <count> entries
+                if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
+                    ((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
                     changed = true;
             }
             if (changed && !bs->inmemory_meta && !bs->readonly)

@@ -294,41 +262,6 @@ resume_2:
             wait_state = 2;
             return 1;
         }
-        if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
-        {
-            // build used_space index
-            for (heap_block_num_t i = 0; i < bs->heap_meta.block_count; i++)
-            {
-                bs->heap_meta.used_space_map.emplace(std::pair<uint32_t, heap_block_num_t>(bs->heap_meta.blocks[i].used_space, i));
-            }
-        }
-        if (heap_invalidated_block_seq.size() && !bs->readonly)
-        {
-            // zero out invalidated blocks not zeroed during the previous OSD execution
-            for (auto inv_seq: heap_invalidated_block_seq)
-            {
-                auto num_it = heap_block_by_seq.find(inv_seq);
-                if (num_it != heap_block_by_seq.end())
-                    heap_invalidated_block_nums.push_back(num_it->second);
-            }
-            memset(metadata_buffer, 0, bs->dsk.meta_block_size);
-            for (i = 0; i < heap_invalidated_block_nums.size(); i++)
-            {
-                GET_SQE();
-                last_read_offset = heap_invalidated_block_nums[i]*bs->dsk.meta_block_size;
-                data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
-                data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
-                my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + last_read_offset);
-                bs->ringloop->submit();
-                submitted++;
-resume_7:
-                if (submitted > 0)
-                {
-                    wait_state = 7;
-                    return 1;
-                }
-            }
-        }
         if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
         {
             std::sort(entries_to_zero.begin(), entries_to_zero.end());

@@ -396,9 +329,8 @@ resume_6:
     return 0;
 }
 
-bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_offset)
+bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
 {
-    uint64_t done_cnt = (block_offset / bs->dsk.meta_block_size) * entries_per_block;
     bool updated = false;
     uint64_t max_i = entries_per_block;
     if (max_i > bs->dsk.block_count-done_cnt)

@@ -497,132 +429,6 @@ bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_
     return updated;
 }
 
-static int bitmap_count_ones(uint8_t *bitmap, int size)
-{
-    int n = 0, i = 0;
-    for (; i <= size-sizeof(unsigned); i += sizeof(unsigned))
-    {
-        n += __builtin_popcount(*(unsigned*)(bitmap+i));
-    }
-    for (; i < size; i++)
-    {
-        n += __builtin_popcount(*(unsigned char*)(bitmap+i));
-    }
-    return n;
-}
-
-// v3 / heap / "cow" metadata block
-bool blockstore_init_meta::handle_heap_meta_block(uint8_t *buf, uint64_t block_offset)
-{
-    if ((block_offset / bs->dsk.meta_block_size) > (heap_block_num_t)-1)
-    {
-        fprintf(stderr, "Metadata area too large\n");
-        exit(1);
-    }
-    // Validate block CRC
-    uint32_t block_crc = *(uint32_t*)(buf + bs->dsk.meta_block_size - 4);
-    if (crc32c(0, buf, bs->dsk.meta_block_size-4) != block_crc)
-    {
-        return false;
-    }
-    // Validate header
-    heap_meta_block_header_t *hdr = (heap_meta_block_header_t*)buf;
-    if (hdr->magic != BLOCKSTORE_META_MAGIC_V1)
-    {
-        return false;
-    }
-    if (hdr->invalidates_blocks > (bs->dsk.meta_block_size-4-sizeof(heap_meta_block_header_t))/sizeof(uint64_t))
-    {
-        fprintf(stderr, "Metadata block at %jx contains too large invalidates_blocks count: %x\n", block_offset, hdr->invalidates_blocks);
-        exit(1);
-    }
-    if (heap_invalidated_block_seq.find(hdr->seq_num) != heap_invalidated_block_seq.end())
-    {
-        // Check if the block is invalidated and handled after the block that invalidates it
-        return false;
-    }
-    uint64_t hdr_size = sizeof(heap_meta_block_header_t) + hdr->invalidates_blocks*8;
-    heap_meta_block_t & blk = bs->heap_meta.blocks[block_offset/bs->dsk.meta_block_size];
-    blk.offset = block_offset;
-    blk.seq_num = hdr->seq_num;
-    blk.used_space = hdr_size + 4;
-    uint64_t *hdr_inv = (uint64_t*)(hdr + 1);
-    for (int i = 0; i < hdr->invalidates_blocks; i++)
-    {
-        blk.invalidates_blocks.push_back(hdr_inv[i]);
-        heap_invalidated_block_seq.insert(hdr_inv[i]);
-    }
-    heap_block_by_seq[hdr->seq_num] = block_offset;
-    // Process sub-blocks
-    uint64_t heap_entry_size = sizeof(heap_meta_entry_t) + bs->dsk.clean_dyn_size;
-    for (uint64_t pos = sizeof(heap_meta_block_header_t); pos < bs->dsk.meta_block_size-4; pos += heap_entry_size)
-    {
-        heap_meta_entry_t *diskentry = (heap_meta_entry_t*)(buf + pos);
-        if (!diskentry->oid.inode || !diskentry->version)
-        {
-            continue;
-        }
-        auto & clean_db = bs->clean_db_shard(diskentry->oid);
-        auto mementry = (clean_entry_heap_t*)(&clean_db[diskentry->oid]);
-        bool exists = mementry->version != 0;
-        if (exists && mementry->version >= diskentry->version)
-        {
-            if (mementry->version == diskentry->version)
-            {
-                // Voluntarily allow duplicates of in-memory entries with different
-                // bitmaps to support checksum updates with hole-punching
-                int old_count = bitmap_count_ones(mementry->bitmap, bs->dsk.clean_entry_bitmap_size);
-                int new_count = bitmap_count_ones(diskentry->bitmap, bs->dsk.clean_entry_bitmap_size);
-                if (old_count < new_count)
-                {
-                    continue;
-                }
-            }
-            else
-            {
-                continue;
-            }
-        }
-        blk.used_space += heap_entry_size;
-        if (exists && mementry->location != UINT64_MAX)
-        {
-            // free the previous block
-            uint64_t old_clean_loc = mementry->location >> bs->dsk.block_order;
-#ifdef BLOCKSTORE_DEBUG
-            printf("Free block %ju from %jx:%jx v%ju\n", 1+old_clean_loc,
-                diskentry->oid.inode, diskentry->oid.stripe, mementry->version);
-#endif
-            bs->data_alloc->set(old_clean_loc, false);
-            bs->inode_space_stats[diskentry->oid.inode] -= bs->dsk.data_block_size;
-            bs->used_blocks--;
-            bs->heap_meta.blocks[mementry->meta_block].used_space -= heap_entry_size;
-        }
-        if (diskentry->location != UINT64_MAX)
-        {
-            bs->data_alloc->set(diskentry->location >> bs->dsk.block_order, true);
-            bs->inode_space_stats[diskentry->oid.inode] += bs->dsk.data_block_size;
-            bs->used_blocks++;
-#ifdef BLOCKSTORE_DEBUG
-            printf("Allocate block (heap entry) %ju: %jx:%jx v%ju\n", 1 + (diskentry->location >> bs->dsk.block_order),
-                diskentry->oid.inode, diskentry->oid.stripe, diskentry->version);
-#endif
-        }
-        mementry->version = diskentry->version;
-        mementry->location = diskentry->location;
-        mementry->meta_block = block_offset / bs->dsk.meta_block_size;
-        mementry->block_offset = block_offset % bs->dsk.meta_block_size;
-        if (exists)
-        {
-            mementry->prev_versions++;
-        }
-        // Extra data: 2 bitmaps + checksums or just 2 bitmaps if inmemory_meta is disabled
-        memcpy(&mementry->bitmap, &diskentry->bitmap, bs->inmemory_meta ? bs->dsk.clean_dyn_size : 2*bs->dsk.clean_entry_bitmap_size);
-        entries_loaded++;
-    }
-    // We have to zero out headers of invalidated blocks, but we'll do it later
-    return false;
-}
-
 blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
 {
     this->bs = bs;
@ -28,13 +28,7 @@ class blockstore_init_meta
|
||||||
unsigned entries_per_block = 0;
|
unsigned entries_per_block = 0;
|
||||||
int i = 0, j = 0;
|
int i = 0, j = 0;
|
||||||
std::vector<uint64_t> entries_to_zero;
|
std::vector<uint64_t> entries_to_zero;
|
||||||
|
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||||
std::map<uint64_t, heap_block_num_t> heap_block_by_seq;
|
|
||||||
std::set<uint64_t> heap_invalidated_block_seq;
|
|
||||||
std::vector<heap_block_num_t> heap_invalidated_block_nums;
|
|
||||||
|
|
||||||
bool handle_array_meta_block(uint8_t *buf, uint64_t block_offset);
|
|
||||||
bool handle_heap_meta_block(uint8_t *buf, uint64_t block_offset);
|
|
||||||
void handle_event(ring_data_t *data, int buf_num);
|
void handle_event(ring_data_t *data, int buf_num);
|
||||||
public:
|
public:
|
||||||
blockstore_init_meta(blockstore_impl_t *bs);
|
blockstore_init_meta(blockstore_impl_t *bs);
|
||||||
|
|
|
@ -111,10 +111,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||||
{
|
{
|
||||||
metadata_buf_size = 4*1024*1024;
|
metadata_buf_size = 4*1024*1024;
|
||||||
}
|
}
|
||||||
if (metadata_buf_size % dsk.meta_block_size)
|
|
||||||
{
|
|
||||||
metadata_buf_size = ((metadata_buf_size+dsk.meta_block_size-1) / dsk.meta_block_size) * dsk.meta_block_size;
|
|
||||||
}
|
|
||||||
if (dsk.meta_device == dsk.data_device)
|
if (dsk.meta_device == dsk.data_device)
|
||||||
{
|
{
|
||||||
disable_meta_fsync = disable_data_fsync;
|
disable_meta_fsync = disable_data_fsync;
|
||||||
|
|
|
@ -148,14 +148,10 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset)
|
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
|
||||||
{
|
{
|
||||||
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
|
|
||||||
{
|
|
||||||
return ((uint8_t*)&clean_it->second) + sizeof(clean_entry_heap_t) + offset;
|
|
||||||
}
|
|
||||||
uint8_t *clean_entry_bitmap;
|
uint8_t *clean_entry_bitmap;
|
||||||
uint64_t meta_loc = clean_it->second.location >> dsk.block_order;
|
uint64_t meta_loc = block_loc >> dsk.block_order;
|
||||||
if (inmemory_meta)
|
if (inmemory_meta)
|
||||||
{
|
{
|
||||||
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
|
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
|
||||||
|
@ -163,9 +159,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterat
|
||||||
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
|
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
|
||||||
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
|
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
|
||||||
}
|
|
||||||
return clean_entry_bitmap;
|
return clean_entry_bitmap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -439,7 +433,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
if (!IS_JOURNAL(dirty.state))
|
if (!IS_JOURNAL(dirty.state))
|
||||||
{
|
{
|
||||||
// Read from data disk, possibly checking checksums
|
// Read from data disk, possibly checking checksums
|
||||||
if (!fulfill_clean_read_journal(read_op, fulfilled, bmp_ptr, dyn_data,
|
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
|
||||||
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
|
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
|
||||||
{
|
{
|
||||||
goto undo_read;
|
goto undo_read;
|
||||||
|
@ -470,13 +464,14 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||||
result_version = clean_it->second.version;
|
result_version = clean_it->second.version;
|
||||||
if (read_op->bitmap)
|
if (read_op->bitmap)
|
||||||
{
|
{
|
||||||
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
|
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
|
||||||
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
|
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (fulfilled < read_op->len)
|
if (fulfilled < read_op->len)
|
||||||
{
|
{
|
||||||
if (!fulfill_clean_read_meta(read_op, fulfilled, clean_it))
|
if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
|
||||||
|
clean_it->second.location, clean_it->second.version))
|
||||||
{
|
{
|
||||||
goto undo_read;
|
goto undo_read;
|
||||||
}
|
}
|
@@ -586,22 +581,40 @@ int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_bu
     return 0;
 }
 
-bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
+bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
     uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
 {
+    bool from_journal = clean_entry_bitmap != NULL;
+    if (!clean_entry_bitmap)
+    {
+        // NULL clean_entry_bitmap means we're reading from data, not from the journal,
+        // and the bitmap location is obvious
+        clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
+    }
     if (dsk.csum_block_size > dsk.bitmap_granularity)
     {
         auto & rv = PRIV(read_op)->read_vec;
-        int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, true,
+        int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
             (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
+        if (!inmemory_meta && !from_journal && req > 0)
+        {
+            // Read checksums from disk
+            uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
+            for (int i = req; i > 0; i--)
+            {
+                rv[rv.size()-i].csum_buf = csum_buf;
+            }
+        }
         for (int i = req; i > 0; i--)
         {
             if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
+            {
                 return false;
+            }
         }
         PRIV(read_op)->clean_block_used = req > 0;
     }
-    else
+    else if (from_journal)
     {
         // Don't scan bitmap - journal writes don't have holes (internal bitmap)!
         uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +

@@ -622,43 +635,6 @@ bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uin
             assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
         }
     }
-    // Increment reference counter if clean data is being read from the disk
-    if (PRIV(read_op)->clean_block_used)
-    {
-        auto & uo = used_clean_objects[clean_loc];
-        uo.refs++;
-        if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
-            uo.was_changed = true;
-        PRIV(read_op)->clean_block_used = clean_loc;
-    }
-    return true;
-}
-
-bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it)
-{
-    uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it, 0);
-    uint64_t clean_loc = clean_it->second.location;
-    if (dsk.csum_block_size > dsk.bitmap_granularity)
-    {
-        auto & rv = PRIV(read_op)->read_vec;
-        int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, NULL, false,
-            (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
-        if (!inmemory_meta && req > 0)
-        {
-            // Read checksums from disk
-            uint8_t *csum_buf = read_clean_meta_block(read_op, clean_it, rv.size()-req);
-            for (int i = req; i > 0; i--)
-            {
-                rv[rv.size()-i].csum_buf = csum_buf;
-            }
-        }
-        for (int i = req; i > 0; i--)
-        {
-            if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
-                return false;
-        }
-        PRIV(read_op)->clean_block_used = req > 0;
-    }
     else
     {
         bool csum_done = !dsk.csum_block_size || inmemory_meta;
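The merged fulfill_clean_read() distinguishes its two callers by the clean_entry_bitmap argument: journal big-write reads pass the bitmap stored with the journal entry, while clean-object reads pass NULL and the function looks the bitmap up by block location itself. A minimal standalone sketch of that dispatch, with hypothetical names and sizes rather than the real blockstore structures:

// Illustrative only: a single read helper that treats a NULL bitmap pointer as
// "clean data read" and resolves the bitmap from the block location itself.
#include <cstdint>
#include <cstdio>
#include <vector>

static const uint64_t block_order = 17;                  // assumed 128 KiB blocks
static std::vector<uint8_t> clean_bitmaps(16 * 4, 0xAA);  // 16 blocks, 4 bitmap bytes each

static uint8_t *lookup_clean_bitmap(uint64_t clean_loc)
{
    return clean_bitmaps.data() + (clean_loc >> block_order) * 4;
}

static void fulfill_read_sketch(uint8_t *clean_entry_bitmap, uint64_t clean_loc)
{
    bool from_journal = clean_entry_bitmap != NULL;  // caller passed a journal-entry bitmap
    if (!clean_entry_bitmap)
    {
        // NULL bitmap means the read targets clean data, so its location is implied
        clean_entry_bitmap = lookup_clean_bitmap(clean_loc);
    }
    printf("from_journal=%d first_bitmap_byte=0x%02x\n", from_journal, clean_entry_bitmap[0]);
}

int main()
{
    uint8_t journal_bitmap[4] = { 0x0F, 0, 0, 0 };
    fulfill_read_sketch(journal_bitmap, 0);          // journal big-write path
    fulfill_read_sketch(NULL, 3ull << block_order);  // clean data path
    return 0;
}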
@@ -686,13 +662,13 @@ bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64
             if (!csum_done)
             {
                 // Read checksums from disk
-                csum_buf = read_clean_meta_block(read_op, clean_it, PRIV(read_op)->read_vec.size());
+                csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
                 csum_done = true;
             }
             uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
             if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
                 bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
-                clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, NULL))
+                clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
             {
                 return false;
             }

@@ -712,22 +688,11 @@ bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64
     return true;
 }
 
-uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos)
+uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
 {
-    uint64_t sector, pos;
     auto & rv = PRIV(op)->read_vec;
-    if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
-    {
-        auto clean_heap_entry = (clean_entry_heap_t*)(&clean_it->second);
-        sector = clean_heap_entry->meta_block * dsk.meta_block_size;
-        pos = clean_heap_entry->block_offset;
-    }
-    else
-    {
-        auto clean_loc = clean_it->second.location;
-        sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
-        pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
-    }
+    auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
+    auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
     uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
     rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
         .copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
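With the iterator parameter gone, read_clean_meta_block() derives the metadata block offset (sector) and the entry offset inside it (pos) purely from clean_loc. A worked example of that arithmetic; the concrete sizes (128 KiB data blocks, 4 KiB metadata blocks, 32-byte clean entries) are illustrative assumptions, not necessarily the real on-disk layout:

#include <cstdint>
#include <cstdio>

int main()
{
    // Assumed geometry for the example only
    uint64_t block_order = 17;                     // 128 KiB data blocks
    uint64_t meta_block_size = 4096;               // one metadata block
    uint64_t clean_entry_size = 32;                // 128 entries per metadata block
    uint64_t clean_loc = 1000ull << block_order;   // data block #1000

    uint64_t entry_num = clean_loc >> block_order;
    uint64_t entries_per_block = meta_block_size / clean_entry_size;
    uint64_t sector = (entry_num / entries_per_block) * meta_block_size;  // 7 * 4096 = 28672
    uint64_t pos = (entry_num % entries_per_block) * clean_entry_size;    // 104 * 32 = 3328

    printf("sector=%ju pos=%ju\n", (uintmax_t)sector, (uintmax_t)pos);
    return 0;
}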
@@ -842,6 +807,11 @@ bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint6
     if (from_journal)
         return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
     clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
+    if (!dyn_data)
+    {
+        assert(inmemory_meta);
+        dyn_data = get_clean_entry_bitmap(clean_loc, 0);
+    }
     return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
 }
 
@@ -899,18 +869,8 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                 auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
                 if (!uo.was_changed)
                 {
-                    bool from_journal = (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG);
-                    auto csum_buf = rv[i].csum_buf;
-                    if (!from_journal && !csum_buf)
-                    {
-                        assert(inmemory_meta);
-                        auto & clean_db = clean_db_shard(op->oid);
-                        auto clean_it = clean_db.find(op->oid);
-                        assert(clean_it != clean_db.end());
-                        csum_buf = get_clean_entry_bitmap(clean_it, 0);
-                    }
                     verify_clean_padded_checksums(
-                        op, rv[i].disk_offset, csum_buf, from_journal, iov, n_iov,
+                        op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
                         [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
                         {
                             ok = false;

@@ -1059,7 +1019,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
         *result_version = clean_it->second.version;
         if (bitmap)
         {
-            void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
+            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
             memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
         }
         return 0;
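handle_read_event() no longer re-derives the checksum buffer by looking the object up in clean_db again: the copy buffer entry already carries csum_buf and the COPY_BUF_JOURNALED_BIG flag, and the in-memory-metadata fallback moves into verify_clean_padded_checksums() (the new `if (!dyn_data)` branch above). A simplified sketch of that "resolve the fallback in the callee" shape, using hypothetical types rather than the real copy_buffer_t:

#include <cstdint>
#include <cstdio>

struct copy_buf_sketch
{
    uint64_t disk_offset;
    bool journaled_big;
    uint8_t *csum_buf;   // may be NULL when metadata is kept in memory
};

static uint8_t inmemory_bitmaps[4] = { 0x11, 0x22, 0x33, 0x44 };

static void verify_sketch(const copy_buf_sketch & cb)
{
    const uint8_t *csums = cb.csum_buf;
    if (!csums)
    {
        // The fallback is resolved here, once, instead of in every caller
        csums = inmemory_bitmaps;
    }
    printf("offset=%ju journaled=%d csum[0]=0x%02x\n",
        (uintmax_t)cb.disk_offset, (int)cb.journaled_big, csums[0]);
}

int main()
{
    uint8_t journal_csums[4] = { 0x99, 0, 0, 0 };
    verify_sketch({ .disk_offset = 0, .journaled_big = true, .csum_buf = journal_csums });
    verify_sketch({ .disk_offset = 131072, .journaled_big = false, .csum_buf = NULL });
    return 0;
}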
@@ -57,7 +57,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
         version = clean_it->second.version + 1;
         if (!is_del)
         {
-            void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
+            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
             memcpy(dyn_ptr, bmp_ptr, dsk.clean_entry_bitmap_size);
         }
     }

@@ -341,7 +341,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
             cancel_all_writes(op, dirty_it, -ENOSPC);
             return 2;
         }
-        if (inmemory_meta && dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
+        if (inmemory_meta)
        {
             // Check once more that metadata entry is zeroed (the reverse means a bug or corruption)
             uint64_t sector = (loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
@@ -33,8 +33,12 @@ void osd_messenger_t::read_requests()
         auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
         io_uring_sqe sqe_local;
         ring_data_t data_local;
-        sqe_local.user_data = (uint64_t)&data_local;
         io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
+        if (iothread)
+        {
+            sqe_local = { .user_data = (uint64_t)&data_local };
+            data_local = {};
+        }
         if (!sqe)
         {
             cl->read_msg.msg_iovlen = 0;
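When an io thread handles the submission, the messenger fills a stack-local io_uring_sqe and a zeroed ring_data_t instead of taking an SQE from the main ring, and the change above only initializes those locals on that path (previously data_local was left uninitialized apart from user_data). A simplified standalone sketch of the pattern with stand-in types; the same local-copy initialization is repeated in try_send() in the next hunk:

#include <cstdint>
#include <cstdio>

struct sqe_sketch { uint64_t user_data = 0; int opcode = 0; };
struct data_sketch { int res = 0; bool more = false; };

static sqe_sketch main_ring_sqe;
// Stand-in for ringloop->get_sqe(): returns an SQE owned by the main ring
static sqe_sketch *get_main_ring_sqe() { return &main_ring_sqe; }

static void prepare_op(bool use_iothread)
{
    sqe_sketch sqe_local;
    data_sketch data_local;
    sqe_sketch *sqe = use_iothread ? &sqe_local : get_main_ring_sqe();
    if (use_iothread)
    {
        // Only touch the local copies on the io-thread path
        sqe_local = { .user_data = (uint64_t)&data_local };
        data_local = {};
    }
    printf("iothread=%d user_data=%ju\n", (int)use_iothread, (uintmax_t)sqe->user_data);
}

int main()
{
    prepare_op(false);  // main ring path
    prepare_op(true);   // io thread path
    return 0;
}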
@@ -194,12 +194,14 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
     auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
     io_uring_sqe sqe_local;
     ring_data_t data_local;
-    sqe_local.user_data = (uint64_t)&data_local;
     io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
-    if (!sqe)
+    if (iothread)
     {
-        return false;
+        sqe_local = { .user_data = (uint64_t)&data_local };
+        data_local = {};
     }
+    if (!sqe)
+        return false;
     cl->write_msg.msg_iov = cl->send_list.data();
     cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
     cl->refs++;
@@ -70,7 +70,7 @@ struct rm_osd_t
             {
                 if (parent->cli->st_cli.peer_states.find(osd_id) != parent->cli->st_cli.peer_states.end())
                 {
-                    is_warning = true;
+                    is_warning = !allow_up;
                     still_up.push_back(osd_id);
                 }
             }
@@ -278,6 +278,7 @@ class osd_t
     void handle_peers();
     bool check_peer_config(osd_client_t *cl, json11::Json conf);
     void repeer_pgs(osd_num_t osd_num);
+    void repeer_pg(pg_t & pg);
     void start_pg_peering(pg_t & pg);
     void drop_dirty_pg_connections(pool_pg_num_t pg);
     void record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state);
@@ -432,9 +432,16 @@ void osd_t::apply_pg_locks_localize_only()
         }
         auto & pool_cfg = pool_it->second;
         auto & pg = pp.second;
+        auto old_disable_pg_locks = pg.disable_pg_locks;
         pg.disable_pg_locks = pg_locks_localize_only &&
             pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
             pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY;
+        if (!pg.disable_pg_locks && old_disable_pg_locks)
+        {
+            // Relock PG
+            printf("[PG %u/%u] Repeer to enable PG locks\n", pg.pool_id, pg.pg_num);
+            repeer_pg(pg);
+        }
     }
 }
 
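The extra check only matters on the transition from "locks disabled" back to "locks required": re-peering is what makes the OSD take the PG lock again, so acting on every pass would repeer needlessly. A tiny sketch of that edge detection, detached from the real pg_t:

#include <cstdio>

// Returns true when the PG has to repeer, i.e. when locks were disabled before
// and have just become required again.
static bool apply_locks_sketch(bool & disable_pg_locks, bool new_value)
{
    bool old_disable = disable_pg_locks;
    disable_pg_locks = new_value;
    return !disable_pg_locks && old_disable;
}

int main()
{
    bool disabled = true;
    printf("repeer=%d\n", (int)apply_locks_sketch(disabled, true));   // still disabled: nothing to do
    printf("repeer=%d\n", (int)apply_locks_sketch(disabled, false));  // locks required again: repeer
    return 0;
}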
@@ -104,21 +104,26 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
             {
                 // Repeer this pg
                 printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
-                if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
-                {
-                    start_pg_peering(pg);
-                }
-                else
-                {
-                    // Stop accepting new operations, wait for current ones to finish or fail
-                    pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
-                    report_pg_state(pg);
-                }
+                repeer_pg(pg);
             }
         }
     }
 }
 
+void osd_t::repeer_pg(pg_t & pg)
+{
+    if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
+    {
+        start_pg_peering(pg);
+    }
+    else
+    {
+        // Stop accepting new operations, wait for current ones to finish or fail
+        pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
+        report_pg_state(pg);
+    }
+}
+
 // Reset PG state (when peering or stopping)
 void osd_t::reset_pg(pg_t & pg)
 {
@@ -125,6 +125,8 @@ void ring_loop_t::loop()
         if (cqe->flags & IORING_CQE_F_MORE)
         {
             // There will be a second notification
+            if (mt)
+                mu.unlock();
             d->res = cqe->res;
             d->more = true;
             if (d->callback)
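In multi-threaded mode the ring loop holds a mutex while it drains completions, so it has to drop that lock before running user code: a completion callback may submit new operations and try to take the same (non-recursive) lock again. The hunk above adds that unlock to the IORING_CQE_F_MORE branch, presumably matching what the ordinary completion path already does. A standalone illustration of the rule, not the actual ring_loop_t code:

#include <cstdio>
#include <functional>
#include <mutex>

static std::mutex mu;
static bool mt = true;  // assume the ring is shared between threads

static void submit_more()
{
    // A completion callback is allowed to queue new work, which needs the lock
    std::lock_guard<std::mutex> lock(mu);
    printf("resubmitted\n");
}

static void handle_completion(const std::function<void()> & callback)
{
    if (mt)
        mu.lock();
    // ... pop the CQE and copy out its result while the lock is held ...
    if (mt)
        mu.unlock();  // release before user code runs
    if (callback)
        callback();   // would deadlock on a non-recursive mutex if we still held it
}

int main()
{
    handle_completion(submit_more);
    return 0;
}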
@@ -59,6 +59,7 @@ SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
 
 ./test_write.sh
 SCHEME=xor ./test_write.sh
+TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' ./test_write.sh
 
 ./test_write_no_same.sh
 