forked from vitalif/vitastor
Compare commits
91 Commits
csi-ext4-n
...
v1.4.8
Author | SHA1 | Date | |
---|---|---|---|
38b8963330 | |||
77167e2920 | |||
5af23672d0 | |||
6bf1f539a6 | |||
02d1f16bbd | |||
fc413038d1 | |||
1bc0b5aab3 | |||
5e934264cf | |||
f20564b44b | |||
b3c15db331 | |||
685bcd6ef9 | |||
3eb389b321 | |||
3d16cde23c | |||
c6406d67fc | |||
f87964861d | |||
62a4f45160 | |||
7048228678 | |||
ea73857450 | |||
6cfe38ec04 | |||
7ae5766fdb | |||
f882c7dd87 | |||
26dd863c8d | |||
2ae859fbc6 | |||
f6cd9f9153 | |||
8389c0f33b | |||
9db2196aef | |||
8d6ae662fe | |||
c777a0041a | |||
2947ea93e8 | |||
978bdc128a | |||
bb2f395f1e | |||
b127da40f7 | |||
ca34a6047a | |||
38ba76e893 | |||
1e3c4edea0 | |||
e7ac855b07 | |||
c53357ac45 | |||
27e9f244ec | |||
8e25a28a08 | |||
5d3317e4f2 | |||
016115c0d4 | |||
e026de95d5 | |||
77c10fd1f8 | |||
581d02e581 | |||
f03a9db4d9 | |||
cb9c30bc31 | |||
a86a380d20 | |||
d2b43cb118 | |||
cc76e6876b | |||
1cec62d25d | |||
1c322b33ed | |||
d27524f441 | |||
ba55f91409 | |||
80aac39513 | |||
2aa5aa7ab6 | |||
3ca3b8a8d8 | |||
2cf649eba6 | |||
5935640a4a | |||
d00d4dbac0 | |||
5d9d6f32a0 | |||
5280d1d561 | |||
317b0feb0a | |||
247f0552db | |||
2f228fa96a | |||
2f6b9c0306 | |||
48b5f871e0 | |||
c17f76a3e4 | |||
a6ab54b1ba | |||
99ee8596ea | |||
c4928e6ecd | |||
ec7dcd1be5 | |||
e600bbc151 | |||
8b8c1179a7 | |||
d5a6fa6dd7 | |||
f757a35a8d | |||
1edf86ed26 | |||
5ca7cde612 | |||
751935ddd8 | |||
d84dee7098 | |||
dcc76eee15 | |||
2f38adeb3d | |||
f72f14e6a7 | |||
1299373988 | |||
178bb0e701 | |||
4ece4dfdd0 | |||
95631773b6 | |||
7239cfb91a | |||
7cea642f4a | |||
dc615403d9 | |||
1a704e06ab | |||
575475de71 |
@@ -395,7 +395,7 @@ jobs:
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
timeout-minutes: 6
|
||||
run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
@@ -532,6 +532,24 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_switch_primary:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_switch_primary.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_write:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
@@ -39,6 +39,10 @@ for my $line (<>)
|
||||
$test_name .= '_'.lc($1).'_'.$2;
|
||||
}
|
||||
}
|
||||
if ($test_name eq 'test_snapshot_chain_ec')
|
||||
{
|
||||
$timeout = 6;
|
||||
}
|
||||
$line =~ s!\./test_!/root/vitastor/tests/test_!;
|
||||
# Gitea CI doesn't support artifacts yet, lol
|
||||
#- name: Upload results
|
||||
|
115
CLA-en.md
Normal file
115
CLA-en.md
Normal file
@@ -0,0 +1,115 @@
|
||||
## Contributor License Agreement
|
||||
|
||||
> This Agreement is made in the Russian and English languages. **The English
|
||||
text of Agreement is for informational purposes only** and is not binding
|
||||
for the Parties.
|
||||
>
|
||||
> In the event of a conflict between the provisions of the Russian and
|
||||
English versions of this Agreement, the **Russian version shall prevail**.
|
||||
>
|
||||
> Russian version is published at https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md
|
||||
|
||||
This document represents the offer of Filippov Vitaliy Vladimirovich
|
||||
("Author"), author and copyright holder of Vitastor software ("Program"),
|
||||
acknowledged by a certificate of Federal Service for Intellectual
|
||||
Property of Russian Federation (Rospatent) # 2021617829 dated 20 May 2021,
|
||||
to "Contributors" to conclude this license agreement as follows
|
||||
("Agreement" or "Offer").
|
||||
|
||||
In accordance with Art. 435, Art. 438 of the Civil Code of the Russian
|
||||
Federation, this Agreement is an offer and in case of acceptance of the
|
||||
offer, an agreement is considered concluded on the conditions specified
|
||||
in the offer.
|
||||
|
||||
1. Applicable Terms. \
|
||||
1.1. "Official Repository" shall mean the computer storage, operated by
|
||||
the Author, containing all prior and future versions of the Source
|
||||
Code of the Program, at Internet addresses https://git.yourcmc.ru/vitalif/vitastor/
|
||||
or https://github.com/vitalif/vitastor/. \
|
||||
1.2. "Contributions" shall mean results of intellectual activity
|
||||
(including, but not limited to, source code, libraries, components,
|
||||
texts, documentation) which can be software or elements of the software
|
||||
and which are provided by Contributors to the Author for inclusion
|
||||
in the Program. \
|
||||
1.3. "Contributor" shall mean a person who provides Contributions to
|
||||
the Author and agrees with all provisions of this Agreement.
|
||||
A Сontributor can be: 1) an individual; or 2) a legal entity or an
|
||||
individual entrepreneur in case when an individual provides Contributions
|
||||
on behalf of third parties, including on behalf of his employer.
|
||||
|
||||
2. Subject of the Agreement. \
|
||||
2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. \
|
||||
2.2. The Contributor grants to the Author the right to use Contributions at his own
|
||||
discretion and without any necessity to get a prior approval from Contributor or
|
||||
any other third party in any way, under a simple (non-exclusive), royalty-free,
|
||||
irrevocable license throughout the world by all means not contrary to law, in whole
|
||||
or as a part of the Program, or other open-source or closed-source computer programs,
|
||||
products or services (hereinafter -- the "License"), including, but not limited to: \
|
||||
2.2.1. to execute Contributions and use them for any tasks; \
|
||||
2.2.2. to publish and distribute Contributions in modified or unmodified form and/or to rent them; \
|
||||
2.2.3. to modify Contributions, add comments, illustrations or any explanations to Contributions while using them; \
|
||||
2.2.4. to create other results of intellectual activity based on Contributions, including derivative works and composite works; \
|
||||
2.2.5. to translate Contributions into other languages, including other programming languages; \
|
||||
2.2.6. to carry out rental and public display of Contributions; \
|
||||
2.2.7. to use Contributions under the trade name and/or any trademark or any other label, or without it, as the Author thinks fit; \
|
||||
2.3. The Contributor grants to the Author the right to sublicense any of the aforementioned
|
||||
rights to third parties on any terms at the Author's discretion. \
|
||||
2.4. The License is provided for the entire duration of Contributor's
|
||||
exclusive intellectual property rights to the Contributions. \
|
||||
2.5. The Contributor grants to the Author the right to decide how and where to mention,
|
||||
or to not mention at all, the fact of his authorship, name, nickname and/or company
|
||||
details when including Contributions into the Program or in any other computer
|
||||
programs, products or services.
|
||||
|
||||
3. Acceptance of the Offer \
|
||||
3.1. The Contributor may provide Contributions to the Author in the form of
|
||||
a "Pull Request" in an Official Repository of the Program or by any
|
||||
other electronic means of communication, including, but not limited to,
|
||||
E-mail or messenger applications. \
|
||||
3.2. The acceptance of the Offer shall be the fact of provision of Contributions
|
||||
to the Author by the Contributor by any means with the following remark:
|
||||
“I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
|
||||
or “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”. \
|
||||
3.3. Date of acceptance of the Offer shall be the date of such provision.
|
||||
|
||||
4. Rights and obligations of the parties. \
|
||||
4.1. The Contributor reserves the right to use Contributions by any lawful means
|
||||
not contrary to this Agreement. \
|
||||
4.2. The Author has the right to refuse to include Contributions into the Program
|
||||
at any moment with no explanation to the Contributor.
|
||||
|
||||
5. Representations and Warranties. \
|
||||
5.1. The person providing Contributions for the purpose of their inclusion
|
||||
in the Program represents and warrants that he is the Contributor
|
||||
or legally acts on the Contributor's behalf. Name or company details
|
||||
of the Contributor shall be provided with the Contribution at the moment
|
||||
of their provision to the Author. \
|
||||
5.2. The Contributor represents and warrants that he legally owns exclusive
|
||||
intellectual property rights to the Contributions. \
|
||||
5.3. The Contributor represents and warrants that any further use of
|
||||
Contributions by the Author as provided by Contributor under the terms
|
||||
of the Agreement does not infringe on intellectual and other rights and
|
||||
legitimate interests of third parties. \
|
||||
5.4. The Contributor represents and warrants that he has all rights and legal
|
||||
capacity needed to accept this Offer; \
|
||||
5.5. The Contributor represents and warrants that Contributions don't
|
||||
contain malware or any information considered illegal under the law
|
||||
of Russian Federation.
|
||||
|
||||
6. Termination of the Agreement \
|
||||
6.1. The Agreement may be terminated at will of both Author and Contributor,
|
||||
formalised in the written form or if the Agreement is terminated on
|
||||
reasons prescribed by the law of Russian Federation.
|
||||
|
||||
7. Final Clauses \
|
||||
7.1. The Contributor may optionally sign the Agreement in the written form. \
|
||||
7.2. The Agreement is deemed to become effective from the Date of signing of
|
||||
the Agreement and until the expiration of Contributor's exclusive
|
||||
intellectual property rights to the Contributions. \
|
||||
7.3. The Author may unilaterally alter the Agreement without informing Contributors.
|
||||
The new version of the document shall come into effect 3 (three) days after
|
||||
being published in the Official Repository of the Program at Internet address
|
||||
[https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md).
|
||||
Contributors should keep informed about the actual version of the Agreement themselves. \
|
||||
7.4. If the Author and the Contributor fail to agree on disputable issues,
|
||||
disputes shall be referred to the Moscow Arbitration court.
|
108
CLA-ru.md
Normal file
108
CLA-ru.md
Normal file
@@ -0,0 +1,108 @@
|
||||
## Лицензионное соглашение с участником
|
||||
|
||||
> Данная Оферта написана в Русской и Английской версиях. **Версия на английском
|
||||
языке предоставляется в информационных целях** и не связывает стороны договора.
|
||||
>
|
||||
> В случае несоответствий между положениями Русской и Английской версий Договора,
|
||||
**Русская версия имеет приоритет**.
|
||||
>
|
||||
> Английская версия опубликована по адресу https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md
|
||||
|
||||
Настоящий договор-оферта (далее по тексту – Оферта, Договор) адресована физическим
|
||||
и юридическим лицам (далее – Участникам) и является официальным публичным предложением
|
||||
Филиппова Виталия Владимировича (далее – Автора) программного обеспечения Vitastor,
|
||||
свидетельство Федеральной службы по интеллектуальной собственности (Роспатент) № 2021617829
|
||||
от 20 мая 2021 г. (далее – Программа) о нижеследующем:
|
||||
|
||||
1. Термины и определения \
|
||||
1.1. Репозиторий – электронное хранилище, содержащее исходный код Программы. \
|
||||
1.2. Доработка – результат интеллектуальной деятельности Участника, включающий
|
||||
в себя изменения или дополнения к исходному коду Программы, которые Участник
|
||||
желает включить в состав Программы для дальнейшего использования и распространения
|
||||
Автором и для этого направляет их Автору. \
|
||||
1.3. Участник – физическое или юридическое лицо, вносящее Доработки в код Программы. \
|
||||
1.4. ГК РФ – Гражданский кодекс Российской Федерации.
|
||||
|
||||
2. Предмет оферты \
|
||||
2.1. Предметом настоящей оферты являются Доработки, отправляемые Участником Автору. \
|
||||
2.2. Участник предоставляет Автору право использовать Доработки по собственному усмотрению
|
||||
и без необходимости предварительного согласования с Участником или иным третьим лицом
|
||||
на условиях простой (неисключительной) безвозмездной безотзывной лицензии, полностью
|
||||
или фрагментарно, в составе Программы или других программ, продуктов или сервисов
|
||||
как с открытым, так и с закрытым исходным кодом, любыми способами, не противоречащими
|
||||
закону, включая, но не ограничиваясь следующими: \
|
||||
2.2.1. Запускать и использовать Доработки для выполнения любых задач; \
|
||||
2.2.2. Распространять, импортировать и доводить Доработки до всеобщего сведения; \
|
||||
2.2.3. Вносить в Доработки изменения, сокращения и дополнения, снабжать Доработки
|
||||
при их использовании комментариями, иллюстрациями или пояснениями; \
|
||||
2.2.4. Создавать на основе Доработок иные результаты интеллектуальной деятельности,
|
||||
в том числе производные и составные произведения; \
|
||||
2.2.5. Переводить Доработки на другие языки, в том числе на другие языки программирования; \
|
||||
2.2.6. Осуществлять прокат и публичный показ Доработок; \
|
||||
2.2.7. Использовать Доработки под любым фирменным наименованием, товарным знаком
|
||||
(знаком обслуживания) или иным обозначением, или без такового. \
|
||||
2.3. Участник предоставляет Автору право сублицензировать полученные права на Доработки
|
||||
третьим лицам на любых условиях на усмотрение Автора. \
|
||||
2.4. Участник предоставляет Автору права на Доработки на территории всего мира. \
|
||||
2.5. Участник предоставляет Автору права на весь срок действия исключительного права
|
||||
Участника на Доработки. \
|
||||
2.6. Участник предоставляет Автору права на Доработки на безвозмездной основе. \
|
||||
2.7. Участник разрешает Автору самостоятельно определять порядок, способ и
|
||||
место указания его имени, реквизитов и/или псевдонима при включении
|
||||
Доработок в состав Программы или других программ, продуктов или сервисов.
|
||||
|
||||
3. Акцепт Оферты \
|
||||
3.1. Участник может передавать Доработки в адрес Автора через зеркала официального
|
||||
Репозитория Программы по адресам https://git.yourcmc.ru/vitalif/vitastor/ или
|
||||
https://github.com/vitalif/vitastor/ в виде “запроса на слияние” (pull request),
|
||||
либо в письменном виде или с помощью любых других электронных средств коммуникации,
|
||||
например, электронной почты или мессенджеров. \
|
||||
3.2. Факт передачи Участником Доработок в адрес Автора любым способом с одной из пометок
|
||||
“I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
|
||||
или “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”
|
||||
является полным и безоговорочным акцептом (принятием) Участником условий настоящей
|
||||
Оферты, т.е. Участник считается ознакомившимся с настоящим публичным договором и
|
||||
в соответствии с ГК РФ признается лицом, вступившим с Автором в договорные отношения
|
||||
на основании настоящей Оферты. \
|
||||
3.3. Датой акцептирования настоящей Оферты считается дата такой передачи.
|
||||
|
||||
4. Права и обязанности Сторон \
|
||||
4.1. Участник сохраняет за собой право использовать Доработки любым законным
|
||||
способом, не противоречащим настоящему Договору. \
|
||||
4.2. Автор вправе отказать Участнику во включении Доработок в состав
|
||||
Программы без объяснения причин в любой момент по своему усмотрению.
|
||||
|
||||
5. Гарантии и заверения \
|
||||
5.1. Лицо, направляющее Доработки для целей их включения в состав Программы,
|
||||
гарантирует, что является Участником или представителем Участника. Имя или реквизиты
|
||||
Участника должны быть указаны при их передаче в адрес Автора Программы. \
|
||||
5.2. Участник гарантирует, что является законным обладателем исключительных прав
|
||||
на Доработки. \
|
||||
5.3. Участник гарантирует, что на момент акцептирования настоящей Оферты ему
|
||||
ничего не известно (и не могло быть известно) о правах третьих лиц на
|
||||
передаваемые Автору Доработки или их часть, которые могут быть нарушены
|
||||
в связи с передачей Доработок по настоящему Договору. \
|
||||
5.4. Участник гарантирует, что является дееспособным лицом и обладает всеми
|
||||
необходимыми правами для заключения Договора. \
|
||||
5.5. Участник гарантирует, что Доработки не содержат вредоносного ПО, а также
|
||||
любой другой информации, запрещённой к распространению по законам Российской
|
||||
Федерации.
|
||||
|
||||
6. Прекращение действия оферты \
|
||||
6.1. Действие настоящего договора может быть прекращено по соглашению сторон,
|
||||
оформленному в письменном виде, а также вследствие его расторжения по основаниям,
|
||||
предусмотренным законом.
|
||||
|
||||
7. Заключительные положения \
|
||||
7.1. Участник вправе по желанию подписать настоящий Договор в письменном виде. \
|
||||
7.2. Настоящий договор действует с момента его заключения и до истечения срока
|
||||
действия исключительных прав Участника на Доработки. \
|
||||
7.3. Автор имеет право в одностороннем порядке вносить изменения и дополнения в договор
|
||||
без специального уведомления об этом Участников. Новая редакция документа вступает
|
||||
в силу через 3 (Три) календарных дня со дня опубликования в официальном Репозитории
|
||||
Программы по адресу в сети Интернет
|
||||
[https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md).
|
||||
Участники самостоятельно отслеживают действующие условия Оферты. \
|
||||
7.4. Все споры, возникающие между сторонами в процессе их взаимодействия по настоящему
|
||||
договору, решаются путём переговоров. В случае невозможности урегулирования споров
|
||||
переговорным порядком стороны разрешают их в Арбитражном суде г.Москвы.
|
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "1.3.1")
|
||||
set(VERSION "1.4.8")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
Submodule cpp-btree updated: 45e6d1f131...8de8b467ac
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v1.3.1
|
||||
VERSION ?= v1.4.8
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v1.3.1
|
||||
image: vitalif/vitastor-csi:v1.4.8
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -121,7 +121,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v1.3.1
|
||||
image: vitalif/vitastor-csi:v1.4.8
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "1.3.1"
|
||||
vitastorCSIDriverVersion = "1.4.8"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
@@ -14,6 +14,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
@@ -32,6 +33,7 @@ type NodeServer struct
|
||||
useVduse bool
|
||||
stateDir string
|
||||
mounter mount.Interface
|
||||
restartInterval time.Duration
|
||||
}
|
||||
|
||||
type DeviceState struct
|
||||
@@ -65,6 +67,16 @@ func NewNodeServer(driver *Driver) *NodeServer
|
||||
if (ns.useVduse)
|
||||
{
|
||||
ns.restoreVduseDaemons()
|
||||
dur, err := time.ParseDuration(os.Getenv("RESTART_INTERVAL"))
|
||||
if (err != nil)
|
||||
{
|
||||
dur = 10 * time.Second
|
||||
}
|
||||
ns.restartInterval = dur
|
||||
if (ns.restartInterval != time.Duration(0))
|
||||
{
|
||||
go ns.restarter()
|
||||
}
|
||||
}
|
||||
return ns
|
||||
}
|
||||
@@ -176,7 +188,6 @@ func (ns *NodeServer) unmapNbd(devicePath string)
|
||||
|
||||
func findByPidFile(pidFile string) (*os.Process, error)
|
||||
{
|
||||
klog.Infof("killing process with PID from file %s", pidFile)
|
||||
pidBuf, err := os.ReadFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
@@ -197,6 +208,7 @@ func findByPidFile(pidFile string) (*os.Process, error)
|
||||
|
||||
func killByPidFile(pidFile string) error
|
||||
{
|
||||
klog.Infof("killing process with PID from file %s", pidFile)
|
||||
proc, err := findByPidFile(pidFile)
|
||||
if (err != nil)
|
||||
{
|
||||
@@ -364,6 +376,21 @@ func (ns *NodeServer) unmapVduseById(vdpaId string)
|
||||
}
|
||||
}
|
||||
|
||||
func (ns *NodeServer) restarter()
|
||||
{
|
||||
// Restart dead VDUSE daemons at regular intervals
|
||||
// Otherwise volume I/O may hang in case of a qemu-storage-daemon crash
|
||||
// Moreover, it may lead to a kernel panic of the kernel is configured to
|
||||
// panic on hung tasks
|
||||
ticker := time.NewTicker(ns.restartInterval)
|
||||
defer ticker.Stop()
|
||||
for
|
||||
{
|
||||
<-ticker.C
|
||||
ns.restoreVduseDaemons()
|
||||
}
|
||||
}
|
||||
|
||||
func (ns *NodeServer) restoreVduseDaemons()
|
||||
{
|
||||
pattern := ns.stateDir+"vitastor-vduse-*.json"
|
||||
|
2
debian/build-vitastor-bookworm.sh
vendored
2
debian/build-vitastor-bookworm.sh
vendored
@@ -3,5 +3,5 @@
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
|
2
debian/build-vitastor-bullseye.sh
vendored
2
debian/build-vitastor-bullseye.sh
vendored
@@ -3,5 +3,5 @@
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
|
2
debian/build-vitastor-buster.sh
vendored
2
debian/build-vitastor-buster.sh
vendored
@@ -3,5 +3,5 @@
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
|
7
debian/build-vitastor-ubuntu-jammy.sh
vendored
Executable file
7
debian/build-vitastor-ubuntu-jammy.sh
vendored
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (1.3.1-1) unstable; urgency=medium
|
||||
vitastor (1.4.8-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
9
debian/libvirt.Dockerfile
vendored
9
debian/libvirt.Dockerfile
vendored
@@ -1,13 +1,14 @@
|
||||
# Build patched libvirt for Debian Buster or Bullseye/Sid inside a container
|
||||
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
|
||||
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
|
||||
|
||||
ARG DISTRO=
|
||||
ARG REL=
|
||||
FROM debian:$REL
|
||||
FROM $DISTRO:$REL
|
||||
ARG REL=
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
|
||||
RUN if ([ "${DISTRO}" = "debian" ]) && ( [ "${REL}" = "buster" -o "${REL}" = "bullseye" ] ); then \
|
||||
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
|
||||
echo >> /etc/apt/preferences; \
|
||||
echo 'Package: *' >> /etc/apt/preferences; \
|
||||
@@ -23,7 +24,7 @@ RUN apt-get -y build-dep libvirt0
|
||||
RUN apt-get -y install libglusterfs-dev
|
||||
RUN apt-get --download-only source libvirt
|
||||
|
||||
ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff /root
|
||||
ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff patches/libvirt-8.0-vitastor.diff /root
|
||||
RUN set -e; \
|
||||
mkdir -p /root/packages/libvirt-$REL; \
|
||||
rm -rf /root/packages/libvirt-$REL/*; \
|
||||
|
14
debian/vitastor.Dockerfile
vendored
14
debian/vitastor.Dockerfile
vendored
@@ -1,8 +1,10 @@
|
||||
# Build Vitastor packages for Debian inside a container
|
||||
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
|
||||
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
|
||||
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
FROM debian:$REL
|
||||
FROM $DISTRO:$REL
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
|
||||
WORKDIR /root
|
||||
@@ -35,8 +37,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-1.3.1; \
|
||||
cd vitastor-1.3.1; \
|
||||
cp -r /root/vitastor vitastor-1.4.8; \
|
||||
cd vitastor-1.4.8; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -49,8 +51,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
|
||||
cd vitastor-1.3.1; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.8.orig.tar.xz vitastor-1.4.8; \
|
||||
cd vitastor-1.4.8; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -6,9 +6,11 @@
|
||||
|
||||
# Client Parameters
|
||||
|
||||
These parameters apply only to clients and affect their interaction with
|
||||
the cluster.
|
||||
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
|
||||
affect their interaction with the cluster.
|
||||
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||
- [client_enable_writeback](#client_enable_writeback)
|
||||
@@ -19,6 +21,26 @@ the cluster.
|
||||
- [nbd_max_devices](#nbd_max_devices)
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 50
|
||||
- Minimum: 10
|
||||
- Can be changed online: yes
|
||||
|
||||
Retry time for I/O requests failed due to inactive PGs or network
|
||||
connectivity errors.
|
||||
|
||||
## client_eio_retry_interval
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 1000
|
||||
- Can be changed online: yes
|
||||
|
||||
Retry time for I/O requests failed due to data corruption or unfinished
|
||||
EC object deletions (has_incomplete PG state). 0 disables such retries
|
||||
and clients are not blocked and just get EIO error code instead.
|
||||
|
||||
## client_max_dirty_bytes
|
||||
|
||||
- Type: integer
|
||||
|
@@ -6,9 +6,11 @@
|
||||
|
||||
# Параметры клиентского кода
|
||||
|
||||
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
|
||||
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
|
||||
затрагивают логику их работы с кластером.
|
||||
|
||||
- [client_retry_interval](#client_retry_interval)
|
||||
- [client_eio_retry_interval](#client_eio_retry_interval)
|
||||
- [client_max_dirty_bytes](#client_max_dirty_bytes)
|
||||
- [client_max_dirty_ops](#client_max_dirty_ops)
|
||||
- [client_enable_writeback](#client_enable_writeback)
|
||||
@@ -19,6 +21,27 @@
|
||||
- [nbd_max_devices](#nbd_max_devices)
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
|
||||
## client_retry_interval
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 50
|
||||
- Минимальное значение: 10
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
|
||||
ошибок сети.
|
||||
|
||||
## client_eio_retry_interval
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 1000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
|
||||
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
|
||||
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
|
||||
этого просто получают код ошибки EIO.
|
||||
|
||||
## client_max_dirty_bytes
|
||||
|
||||
- Тип: целое число
|
||||
|
@@ -19,8 +19,8 @@ These parameters only apply to Monitors.
|
||||
## etcd_mon_ttl
|
||||
|
||||
- Type: seconds
|
||||
- Default: 30
|
||||
- Minimum: 10
|
||||
- Default: 1
|
||||
- Minimum: 5
|
||||
|
||||
Monitor etcd lease refresh interval in seconds
|
||||
|
||||
|
@@ -19,8 +19,8 @@
|
||||
## etcd_mon_ttl
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 30
|
||||
- Минимальное значение: 10
|
||||
- Значение по умолчанию: 1
|
||||
- Минимальное значение: 5
|
||||
|
||||
Интервал обновления etcd резервации (lease) монитором
|
||||
|
||||
|
@@ -25,7 +25,6 @@ between clients, OSDs and etcd.
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
- [osd_ping_timeout](#osd_ping_timeout)
|
||||
- [up_wait_retry_interval](#up_wait_retry_interval)
|
||||
- [max_etcd_attempts](#max_etcd_attempts)
|
||||
- [etcd_quick_timeout](#etcd_quick_timeout)
|
||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||
@@ -212,17 +211,6 @@ Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
|
||||
within this time, the connection to it is dropped and a reconnection attempt
|
||||
is scheduled.
|
||||
|
||||
## up_wait_retry_interval
|
||||
|
||||
- Type: milliseconds
|
||||
- Default: 500
|
||||
- Minimum: 50
|
||||
- Can be changed online: yes
|
||||
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
the time for the clients to wait before re-attempting such I/O requests.
|
||||
|
||||
## max_etcd_attempts
|
||||
|
||||
- Type: integer
|
||||
|
@@ -25,7 +25,6 @@
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
- [osd_ping_timeout](#osd_ping_timeout)
|
||||
- [up_wait_retry_interval](#up_wait_retry_interval)
|
||||
- [max_etcd_attempts](#max_etcd_attempts)
|
||||
- [etcd_quick_timeout](#etcd_quick_timeout)
|
||||
- [etcd_slow_timeout](#etcd_slow_timeout)
|
||||
@@ -221,19 +220,6 @@ OSD в любом случае согласовывают реальное зн
|
||||
Если OSD не отвечает за это время, соединение отключается и производится
|
||||
повторная попытка соединения.
|
||||
|
||||
## up_wait_retry_interval
|
||||
|
||||
- Тип: миллисекунды
|
||||
- Значение по умолчанию: 500
|
||||
- Минимальное значение: 50
|
||||
- Можно менять на лету: да
|
||||
|
||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
|
||||
они отвечают клиентам специальным кодом ошибки, означающим, что клиент
|
||||
должен некоторое время подождать перед повторением запроса. Именно это время
|
||||
ожидания задаёт данный параметр.
|
||||
|
||||
## max_etcd_attempts
|
||||
|
||||
- Тип: целое число
|
||||
|
@@ -19,6 +19,7 @@ them, even without restarting by updating configuration in etcd.
|
||||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_sleep_us](#recovery_sleep_us)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
@@ -51,6 +52,14 @@ them, even without restarting by updating configuration in etcd.
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
- [recovery_tune_interval](#recovery_tune_interval)
|
||||
- [recovery_tune_util_low](#recovery_tune_util_low)
|
||||
- [recovery_tune_util_high](#recovery_tune_util_high)
|
||||
- [recovery_tune_client_util_low](#recovery_tune_client_util_low)
|
||||
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -135,12 +144,24 @@ operations before issuing an fsync operation internally.
|
||||
## recovery_queue_depth
|
||||
|
||||
- Type: integer
|
||||
- Default: 4
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed or recovery
|
||||
and rebalancing, but it's planned to implement more.
|
||||
Maximum recovery and rebalance operations initiated by each OSD in parallel.
|
||||
Note that each OSD talks to a lot of other OSDs so actual number of parallel
|
||||
recovery operations per each OSD is greater than just recovery_queue_depth.
|
||||
Increasing this parameter can speed up recovery if [auto-tuning](#recovery_tune_interval)
|
||||
allows it or if it is disabled.
|
||||
|
||||
## recovery_sleep_us
|
||||
|
||||
- Type: microseconds
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Delay for all recovery- and rebalance- related operations. If non-zero,
|
||||
such operations are artificially slowed down to reduce the impact on
|
||||
client I/O.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
@@ -508,3 +529,90 @@ the variant with most available equal copies is correct. For example, if
|
||||
you have 3 replicas and 1 of them differs, this one is considered to be
|
||||
corrupted. But if there is no "best" version with more copies than all
|
||||
others have then the object is also marked as inconsistent.
|
||||
|
||||
## recovery_tune_interval
|
||||
|
||||
- Type: seconds
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Interval at which OSD re-considers client and recovery load and automatically
|
||||
adjusts [recovery_sleep_us](#recovery_sleep_us). Recovery auto-tuning is
|
||||
disabled if recovery_tune_interval is set to 0.
|
||||
|
||||
Auto-tuning targets utilization. Utilization is a measure of load and is
|
||||
equal to the product of iops and average latency (so it may be greater
|
||||
than 1). You set "low" and "high" client utilization thresholds and two
|
||||
corresponding target recovery utilization levels. OSD calculates desired
|
||||
recovery utilization from client utilization using linear interpolation
|
||||
and auto-tunes recovery operation delay to make actual recovery utilization
|
||||
match desired.
|
||||
|
||||
This allows to reduce recovery/rebalance impact on client operations. It is
|
||||
of course impossible to remove it completely, but it should become adequate.
|
||||
In some tests rebalance could earlier drop client write speed from 1.5 GB/s
|
||||
to 50-100 MB/s, with default auto-tuning settings it now only reduces
|
||||
to ~1 GB/s.
|
||||
|
||||
## recovery_tune_util_low
|
||||
|
||||
- Type: number
|
||||
- Default: 0.1
|
||||
- Can be changed online: yes
|
||||
|
||||
Desired recovery/rebalance utilization when client load is high, i.e. when
|
||||
it is at or above recovery_tune_client_util_high.
|
||||
|
||||
## recovery_tune_util_high
|
||||
|
||||
- Type: number
|
||||
- Default: 1
|
||||
- Can be changed online: yes
|
||||
|
||||
Desired recovery/rebalance utilization when client load is low, i.e. when
|
||||
it is at or below recovery_tune_client_util_low.
|
||||
|
||||
## recovery_tune_client_util_low
|
||||
|
||||
- Type: number
|
||||
- Default: 0
|
||||
- Can be changed online: yes
|
||||
|
||||
Client utilization considered "low".
|
||||
|
||||
## recovery_tune_client_util_high
|
||||
|
||||
- Type: number
|
||||
- Default: 0.5
|
||||
- Can be changed online: yes
|
||||
|
||||
Client utilization considered "high".
|
||||
|
||||
## recovery_tune_agg_interval
|
||||
|
||||
- Type: integer
|
||||
- Default: 10
|
||||
- Can be changed online: yes
|
||||
|
||||
The number of last auto-tuning iterations to use for calculating the
|
||||
delay as average. Lower values result in quicker response to client
|
||||
load change, higher values result in more stable delay. Default value of 10
|
||||
is usually fine.
|
||||
|
||||
## recovery_tune_sleep_min_us
|
||||
|
||||
- Type: microseconds
|
||||
- Default: 10
|
||||
- Can be changed online: yes
|
||||
|
||||
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
|
||||
are changed to 0.
|
||||
|
||||
## recovery_tune_sleep_cutoff_us
|
||||
|
||||
- Type: microseconds
|
||||
- Default: 10000000
|
||||
- Can be changed online: yes
|
||||
|
||||
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
||||
are treated as outliers and ignored in aggregation.
|
||||
|
@@ -20,6 +20,7 @@
|
||||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_sleep_us](#recovery_sleep_us)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
@@ -52,6 +53,14 @@
|
||||
- [scrub_list_limit](#scrub_list_limit)
|
||||
- [scrub_find_best](#scrub_find_best)
|
||||
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
|
||||
- [recovery_tune_interval](#recovery_tune_interval)
|
||||
- [recovery_tune_util_low](#recovery_tune_util_low)
|
||||
- [recovery_tune_util_high](#recovery_tune_util_high)
|
||||
- [recovery_tune_client_util_low](#recovery_tune_client_util_low)
|
||||
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
|
||||
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
|
||||
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
|
||||
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
|
||||
|
||||
## etcd_report_interval
|
||||
|
||||
@@ -138,13 +147,25 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
## recovery_queue_depth
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 4
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное число операций восстановления на одном первичном OSD в любой
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
Максимальное число параллельных операций восстановления, инициируемых одним
|
||||
OSD в любой момент времени. Имейте в виду, что каждый OSD обычно работает с
|
||||
многими другими OSD, так что на практике параллелизм восстановления больше,
|
||||
чем просто recovery_queue_depth. Увеличение значения этого параметра может
|
||||
ускорить восстановление если [автотюнинг скорости](#recovery_tune_interval)
|
||||
разрешает это или если он отключён.
|
||||
|
||||
## recovery_sleep_us
|
||||
|
||||
- Тип: микросекунды
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Delay for all recovery- and rebalance- related operations. If non-zero,
|
||||
such operations are artificially slowed down to reduce the impact on
|
||||
client I/O.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
@@ -535,3 +556,93 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
|
||||
## recovery_tune_interval
|
||||
|
||||
- Тип: секунды
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Интервал, с которым OSD пересматривает клиентскую нагрузку и нагрузку
|
||||
восстановления и автоматически подстраивает [recovery_sleep_us](#recovery_sleep_us).
|
||||
Автотюнинг (автоподстройка) отключается, если recovery_tune_interval
|
||||
устанавливается в значение 0.
|
||||
|
||||
Автотюнинг регулирует утилизацию. Утилизация является мерой нагрузки
|
||||
и равна произведению числа операций в секунду и средней задержки
|
||||
(то есть, она может быть выше 1). Вы задаёте два уровня клиентской
|
||||
утилизации - "низкий" и "высокий" (low и high) и два соответствующих
|
||||
целевых уровня утилизации операциями восстановления. OSD рассчитывает
|
||||
желаемый уровень утилизации восстановления линейной интерполяцией от
|
||||
клиентской утилизации и подстраивает задержку операций восстановления
|
||||
так, чтобы фактическая утилизация восстановления совпадала с желаемой.
|
||||
|
||||
Это позволяет снизить влияние восстановления и ребаланса на клиентские
|
||||
операции. Конечно, невозможно исключить такое влияние полностью, но оно
|
||||
должно становиться адекватнее. В некоторых тестах перебалансировка могла
|
||||
снижать клиентскую скорость записи с 1.5 ГБ/с до 50-100 МБ/с, а теперь, с
|
||||
настройками автотюнинга по умолчанию, она снижается только до ~1 ГБ/с.
|
||||
|
||||
## recovery_tune_util_low
|
||||
|
||||
- Тип: число
|
||||
- Значение по умолчанию: 0.1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
|
||||
высокая, то есть, находится на уровне или выше recovery_tune_client_util_high.
|
||||
|
||||
## recovery_tune_util_high
|
||||
|
||||
- Тип: число
|
||||
- Значение по умолчанию: 1
|
||||
- Можно менять на лету: да
|
||||
|
||||
Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
|
||||
низкая, то есть, находится на уровне или ниже recovery_tune_client_util_low.
|
||||
|
||||
## recovery_tune_client_util_low
|
||||
|
||||
- Тип: число
|
||||
- Значение по умолчанию: 0
|
||||
- Можно менять на лету: да
|
||||
|
||||
Клиентская утилизация, которая считается "низкой".
|
||||
|
||||
## recovery_tune_client_util_high
|
||||
|
||||
- Тип: число
|
||||
- Значение по умолчанию: 0.5
|
||||
- Можно менять на лету: да
|
||||
|
||||
Клиентская утилизация, которая считается "высокой".
|
||||
|
||||
## recovery_tune_agg_interval
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 10
|
||||
- Можно менять на лету: да
|
||||
|
||||
Число последних итераций автоподстройки для расчёта задержки как среднего
|
||||
значения. Меньшие значения параметра ускоряют отклик на изменение нагрузки,
|
||||
большие значения делают задержку стабильнее. Значение по умолчанию 10
|
||||
обычно нормальное и не требует изменений.
|
||||
|
||||
## recovery_tune_sleep_min_us
|
||||
|
||||
- Тип: микросекунды
|
||||
- Значение по умолчанию: 10
|
||||
- Можно менять на лету: да
|
||||
|
||||
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||
Меньшие значения заменяются на 0.
|
||||
|
||||
## recovery_tune_sleep_cutoff_us
|
||||
|
||||
- Тип: микросекунды
|
||||
- Значение по умолчанию: 10000000
|
||||
- Можно менять на лету: да
|
||||
|
||||
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||
Большие значения считаются случайными выбросами и игнорируются в
|
||||
усреднении.
|
||||
|
@@ -154,6 +154,9 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
|
||||
OSDs, PG is deactivated for both read and write. So you know that a fresh
|
||||
write always goes to at least (pg_minsize) OSDs (disks).
|
||||
|
||||
That is, pg_size minus pg_minsize sets the number of disk failures to tolerate
|
||||
without temporary downtime (for [osd_out_time](monitor.en.md#osd_out_time)).
|
||||
|
||||
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
|
||||
read-only instead of deactivating them.
|
||||
|
||||
|
@@ -157,6 +157,10 @@
|
||||
OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
|
||||
что новые блоки данных всегда записываются как минимум на pg_minsize дисков.
|
||||
|
||||
По сути, разница pg_size и pg_minsize задаёт число отказов дисков, которые пул
|
||||
может пережить без временной (на [osd_out_time](monitor.ru.md#osd_out_time))
|
||||
остановки обслуживания.
|
||||
|
||||
FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
|
||||
PG на перевод их в режим только для чтения.
|
||||
|
||||
|
@@ -1,3 +1,27 @@
|
||||
- name: client_retry_interval
|
||||
type: ms
|
||||
min: 10
|
||||
default: 50
|
||||
online: true
|
||||
info: |
|
||||
Retry time for I/O requests failed due to inactive PGs or network
|
||||
connectivity errors.
|
||||
info_ru: |
|
||||
Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
|
||||
ошибок сети.
|
||||
- name: client_eio_retry_interval
|
||||
type: ms
|
||||
default: 1000
|
||||
online: true
|
||||
info: |
|
||||
Retry time for I/O requests failed due to data corruption or unfinished
|
||||
EC object deletions (has_incomplete PG state). 0 disables such retries
|
||||
and clients are not blocked and just get EIO error code instead.
|
||||
info_ru: |
|
||||
Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
|
||||
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
|
||||
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
|
||||
этого просто получают код ошибки EIO.
|
||||
- name: client_max_dirty_bytes
|
||||
type: int
|
||||
default: 33554432
|
||||
|
@@ -38,6 +38,7 @@ const types = {
|
||||
bool: 'boolean',
|
||||
int: 'integer',
|
||||
sec: 'seconds',
|
||||
float: 'number',
|
||||
ms: 'milliseconds',
|
||||
us: 'microseconds',
|
||||
},
|
||||
@@ -46,6 +47,7 @@ const types = {
|
||||
bool: 'булево (да/нет)',
|
||||
int: 'целое число',
|
||||
sec: 'секунды',
|
||||
float: 'число',
|
||||
ms: 'миллисекунды',
|
||||
us: 'микросекунды',
|
||||
},
|
||||
|
@@ -1,7 +1,7 @@
|
||||
- name: etcd_mon_ttl
|
||||
type: sec
|
||||
min: 10
|
||||
default: 30
|
||||
min: 5
|
||||
default: 1
|
||||
info: Monitor etcd lease refresh interval in seconds
|
||||
info_ru: Интервал обновления etcd резервации (lease) монитором
|
||||
- name: etcd_mon_timeout
|
||||
|
@@ -243,21 +243,6 @@
|
||||
Максимальное время ожидания ответа на запрос проверки состояния соединения.
|
||||
Если OSD не отвечает за это время, соединение отключается и производится
|
||||
повторная попытка соединения.
|
||||
- name: up_wait_retry_interval
|
||||
type: ms
|
||||
min: 50
|
||||
default: 500
|
||||
online: true
|
||||
info: |
|
||||
OSDs respond to clients with a special error code when they receive I/O
|
||||
requests for a PG that's not synchronized and started. This parameter sets
|
||||
the time for the clients to wait before re-attempting such I/O requests.
|
||||
info_ru: |
|
||||
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
|
||||
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
|
||||
они отвечают клиентам специальным кодом ошибки, означающим, что клиент
|
||||
должен некоторое время подождать перед повторением запроса. Именно это время
|
||||
ожидания задаёт данный параметр.
|
||||
- name: max_etcd_attempts
|
||||
type: int
|
||||
default: 5
|
||||
|
@@ -107,17 +107,29 @@
|
||||
принудительной отправкой fsync-а.
|
||||
- name: recovery_queue_depth
|
||||
type: int
|
||||
default: 4
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed or recovery
|
||||
and rebalancing, but it's planned to implement more.
|
||||
Maximum recovery and rebalance operations initiated by each OSD in parallel.
|
||||
Note that each OSD talks to a lot of other OSDs so actual number of parallel
|
||||
recovery operations per each OSD is greater than just recovery_queue_depth.
|
||||
Increasing this parameter can speed up recovery if [auto-tuning](#recovery_tune_interval)
|
||||
allows it or if it is disabled.
|
||||
info_ru: |
|
||||
Максимальное число операций восстановления на одном первичном OSD в любой
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
Максимальное число параллельных операций восстановления, инициируемых одним
|
||||
OSD в любой момент времени. Имейте в виду, что каждый OSD обычно работает с
|
||||
многими другими OSD, так что на практике параллелизм восстановления больше,
|
||||
чем просто recovery_queue_depth. Увеличение значения этого параметра может
|
||||
ускорить восстановление если [автотюнинг скорости](#recovery_tune_interval)
|
||||
разрешает это или если он отключён.
|
||||
- name: recovery_sleep_us
|
||||
type: us
|
||||
default: 0
|
||||
online: true
|
||||
info: |
|
||||
Delay for all recovery- and rebalance- related operations. If non-zero,
|
||||
such operations are artificially slowed down to reduce the impact on
|
||||
client I/O.
|
||||
- name: recovery_pg_switch
|
||||
type: int
|
||||
default: 128
|
||||
@@ -626,3 +638,112 @@
|
||||
считается некорректной. Однако, если "лучшую" версию с числом доступных
|
||||
копий большим, чем у всех других версий, найти невозможно, то объект тоже
|
||||
маркируется неконсистентным.
|
||||
- name: recovery_tune_interval
|
||||
type: sec
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Interval at which OSD re-considers client and recovery load and automatically
|
||||
adjusts [recovery_sleep_us](#recovery_sleep_us). Recovery auto-tuning is
|
||||
disabled if recovery_tune_interval is set to 0.
|
||||
|
||||
Auto-tuning targets utilization. Utilization is a measure of load and is
|
||||
equal to the product of iops and average latency (so it may be greater
|
||||
than 1). You set "low" and "high" client utilization thresholds and two
|
||||
corresponding target recovery utilization levels. OSD calculates desired
|
||||
recovery utilization from client utilization using linear interpolation
|
||||
and auto-tunes recovery operation delay to make actual recovery utilization
|
||||
match desired.
|
||||
|
||||
This allows to reduce recovery/rebalance impact on client operations. It is
|
||||
of course impossible to remove it completely, but it should become adequate.
|
||||
In some tests rebalance could earlier drop client write speed from 1.5 GB/s
|
||||
to 50-100 MB/s, with default auto-tuning settings it now only reduces
|
||||
to ~1 GB/s.
|
||||
info_ru: |
|
||||
Интервал, с которым OSD пересматривает клиентскую нагрузку и нагрузку
|
||||
восстановления и автоматически подстраивает [recovery_sleep_us](#recovery_sleep_us).
|
||||
Автотюнинг (автоподстройка) отключается, если recovery_tune_interval
|
||||
устанавливается в значение 0.
|
||||
|
||||
Автотюнинг регулирует утилизацию. Утилизация является мерой нагрузки
|
||||
и равна произведению числа операций в секунду и средней задержки
|
||||
(то есть, она может быть выше 1). Вы задаёте два уровня клиентской
|
||||
утилизации - "низкий" и "высокий" (low и high) и два соответствующих
|
||||
целевых уровня утилизации операциями восстановления. OSD рассчитывает
|
||||
желаемый уровень утилизации восстановления линейной интерполяцией от
|
||||
клиентской утилизации и подстраивает задержку операций восстановления
|
||||
так, чтобы фактическая утилизация восстановления совпадала с желаемой.
|
||||
|
||||
Это позволяет снизить влияние восстановления и ребаланса на клиентские
|
||||
операции. Конечно, невозможно исключить такое влияние полностью, но оно
|
||||
должно становиться адекватнее. В некоторых тестах перебалансировка могла
|
||||
снижать клиентскую скорость записи с 1.5 ГБ/с до 50-100 МБ/с, а теперь, с
|
||||
настройками автотюнинга по умолчанию, она снижается только до ~1 ГБ/с.
|
||||
- name: recovery_tune_util_low
|
||||
type: float
|
||||
default: 0.1
|
||||
online: true
|
||||
info: |
|
||||
Desired recovery/rebalance utilization when client load is high, i.e. when
|
||||
it is at or above recovery_tune_client_util_high.
|
||||
info_ru: |
|
||||
Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
|
||||
высокая, то есть, находится на уровне или выше recovery_tune_client_util_high.
|
||||
- name: recovery_tune_util_high
|
||||
type: float
|
||||
default: 1
|
||||
online: true
|
||||
info: |
|
||||
Desired recovery/rebalance utilization when client load is low, i.e. when
|
||||
it is at or below recovery_tune_client_util_low.
|
||||
info_ru: |
|
||||
Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
|
||||
низкая, то есть, находится на уровне или ниже recovery_tune_client_util_low.
|
||||
- name: recovery_tune_client_util_low
|
||||
type: float
|
||||
default: 0
|
||||
online: true
|
||||
info: Client utilization considered "low".
|
||||
info_ru: Клиентская утилизация, которая считается "низкой".
|
||||
- name: recovery_tune_client_util_high
|
||||
type: float
|
||||
default: 0.5
|
||||
online: true
|
||||
info: Client utilization considered "high".
|
||||
info_ru: Клиентская утилизация, которая считается "высокой".
|
||||
- name: recovery_tune_agg_interval
|
||||
type: int
|
||||
default: 10
|
||||
online: true
|
||||
info: |
|
||||
The number of last auto-tuning iterations to use for calculating the
|
||||
delay as average. Lower values result in quicker response to client
|
||||
load change, higher values result in more stable delay. Default value of 10
|
||||
is usually fine.
|
||||
info_ru: |
|
||||
Число последних итераций автоподстройки для расчёта задержки как среднего
|
||||
значения. Меньшие значения параметра ускоряют отклик на изменение нагрузки,
|
||||
большие значения делают задержку стабильнее. Значение по умолчанию 10
|
||||
обычно нормальное и не требует изменений.
|
||||
- name: recovery_tune_sleep_min_us
|
||||
type: us
|
||||
default: 10
|
||||
online: true
|
||||
info: |
|
||||
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
|
||||
are changed to 0.
|
||||
info_ru: |
|
||||
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||
Меньшие значения заменяются на 0.
|
||||
- name: recovery_tune_sleep_cutoff_us
|
||||
type: us
|
||||
default: 10000000
|
||||
online: true
|
||||
info: |
|
||||
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
|
||||
are treated as outliers and ignored in aggregation.
|
||||
info_ru: |
|
||||
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
|
||||
Большие значения считаются случайными выбросами и игнорируются в
|
||||
усреднении.
|
||||
|
@@ -37,6 +37,7 @@ Vitastor CSI supports:
|
||||
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
|
||||
- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
|
||||
- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
|
||||
- VDUSE daemon auto-restart - handler processes are automatically restarted if they crash due to a bug in Vitastor client code
|
||||
- Multiple clusters by using multiple configuration files in ConfigMap.
|
||||
|
||||
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
||||
|
@@ -37,6 +37,7 @@ CSI-плагин Vitastor поддерживает:
|
||||
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
|
||||
- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
|
||||
- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
|
||||
- Автоперезапуск демонов VDUSE - процесс-обработчик автоматически перезапустится, если он внезапно упадёт из-за бага в коде клиента Vitastor
|
||||
- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
|
||||
|
||||
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
||||
|
@@ -25,7 +25,7 @@ vitastor: vitastor
|
||||
vitastor_pool testpool
|
||||
# path to the configuration file
|
||||
vitastor_config_path /etc/vitastor/vitastor.conf
|
||||
# etcd address(es), required only if missing in the configuration file
|
||||
# etcd address(es), OPTIONAL, required only if missing in the configuration file
|
||||
vitastor_etcd_address 192.168.7.2:2379/v3
|
||||
# prefix for keys in etcd
|
||||
vitastor_etcd_prefix /vitastor
|
||||
|
@@ -24,7 +24,7 @@ vitastor: vitastor
|
||||
vitastor_pool testpool
|
||||
# Путь к файлу конфигурации
|
||||
vitastor_config_path /etc/vitastor/vitastor.conf
|
||||
# Адрес(а) etcd, нужны, только если не указаны в vitastor.conf
|
||||
# Адрес(а) etcd, ОПЦИОНАЛЬНЫ, нужны, только если не указаны в vitastor.conf
|
||||
vitastor_etcd_address 192.168.7.2:2379/v3
|
||||
# Префикс ключей метаданных в etcd
|
||||
vitastor_etcd_prefix /vitastor
|
||||
|
@@ -32,6 +32,7 @@
|
||||
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
|
||||
- [Checksums](../config/layout-osd.en.md#data_csum_type)
|
||||
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
|
||||
- [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
|
||||
|
||||
## Plugins and tools
|
||||
|
||||
|
@@ -34,6 +34,7 @@
|
||||
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
|
||||
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
|
||||
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
|
||||
- [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
|
||||
|
||||
## Драйверы и инструменты
|
||||
|
||||
|
@@ -11,19 +11,26 @@ Replicated setups:
|
||||
- Single-threaded write+fsync latency:
|
||||
- With immediate commit: 2 network roundtrips + 1 disk write.
|
||||
- With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
|
||||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
|
||||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).
|
||||
- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
|
||||
- Linear write: `min(total network bandwidth, sum(disk write MB/s / number of replicas))`.
|
||||
- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
|
||||
- Saturated parallel write iops: `min(total network bandwidth / number of replicas, sum(disk write iops / number of replicas / (write amplification = 4)))`.
|
||||
|
||||
EC/XOR setups:
|
||||
EC/XOR setups (EC N+K):
|
||||
- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
|
||||
- Single-threaded write+fsync latency:
|
||||
- With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
|
||||
- With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
|
||||
- 0.5 in actually (k-1)/k which means that an additional roundtrip doesn't happen when
|
||||
- 0.5 is actually `(N-1)/N` which means that an additional roundtrip doesn't happen when
|
||||
the read sub-operation can be served locally.
|
||||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
|
||||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
|
||||
In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.
|
||||
- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
|
||||
- Linear write: `min(total network bandwidth, sum(disk write MB/s * N/(N+K)))`.
|
||||
- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
|
||||
- Saturated parallel write iops: roughly `total iops / (N+K) / WA`. More exactly,
|
||||
`min(total network bandwidth * N/(N+K), sum(disk randrw iops / (N*4 + K*5 + 1)))` with
|
||||
random read/write mix corresponding to `(N-1)/(N*4 + K*5 + 1)*100 % reads`.
|
||||
- For example, with EC 2+1 it is: `(7% randrw iops) / 14`.
|
||||
- With EC 6+3 it is: `(12.5% randrw iops) / 40`.
|
||||
|
||||
Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
||||
1. Journal block write
|
||||
|
@@ -11,20 +11,27 @@
|
||||
- Запись+fsync в 1 поток:
|
||||
- С мгновенным сбросом: 2 RTT + 1 запись.
|
||||
- С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync.
|
||||
- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
|
||||
- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше.
|
||||
- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети (сумма пропускной способности сети всех нод), если в сеть упрётся раньше.
|
||||
- Линейная запись: сумма МБ/с записи всех дисков / число реплик, либо производительность сети / число реплик, если в сеть упрётся раньше.
|
||||
- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков, либо производительность сети, если в сеть упрётся раньше.
|
||||
- Параллельная случайная мелкая запись: сумма IOPS записи всех дисков / число реплик / WA, либо производительность сети / число реплик, если в сеть упрётся раньше.
|
||||
|
||||
При использовании кодов коррекции ошибок (EC):
|
||||
При использовании кодов коррекции ошибок (EC N+K):
|
||||
- Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение.
|
||||
- Запись+fsync в 1 поток:
|
||||
- С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи.
|
||||
- С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync.
|
||||
- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных,
|
||||
- Под 0.5 на самом деле подразумевается (N-1)/N, где N - число дисков данных,
|
||||
что означает, что дополнительное обращение по сети не нужно, когда операция
|
||||
чтения обслуживается локально.
|
||||
- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
|
||||
- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше.
|
||||
Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше.
|
||||
- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети, если в сеть упрётся раньше.
|
||||
- Линейная запись: сумма МБ/с записи всех дисков * N/(N+K), либо производительность сети * N / (N+K), если в сеть упрётся раньше.
|
||||
- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков либо производительность сети, если в сеть упрётся раньше.
|
||||
- Параллельная случайная мелкая запись: грубо `(сумма IOPS / (N+K) / WA)`. Если точнее, то:
|
||||
сумма смешанного IOPS всех дисков при `(N-1)/(N*4 + K*5 + 1)*100 %` чтения, делённая на `(N*4 + K*5 + 1)`.
|
||||
Либо, производительность сети * N/(N+K), если в сеть упрётся раньше.
|
||||
- Например, при EC 2+1 это: `(сумма IOPS при 7% чтения) / 14`.
|
||||
- При EC 6+3 это: `(сумма IOPS при 12.5% чтения) / 40`.
|
||||
|
||||
WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5:
|
||||
1. Запись метаданных в журнал
|
||||
|
@@ -131,19 +131,18 @@ See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
|
||||
|
||||
Rename, resize image or change its readonly status. Images with children can't be made read-write.
|
||||
If the new size is smaller than the old size, extra data will be purged.
|
||||
You should resize file system in the image, if present, before shrinking it.
|
||||
|
||||
```
|
||||
-f|--force Proceed with shrinking or setting readwrite flag even if the image has children.
|
||||
```
|
||||
| `-f|--force` | Proceed with shrinking or setting readwrite flag even if the image has children. |
|
||||
| `--down-ok` | Proceed with shrinking even if some data will be left on unavailable OSDs. |
|
||||
|
||||
## rm
|
||||
|
||||
`vitastor-cli rm <from> [<to>] [--writers-stopped]`
|
||||
`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
|
||||
|
||||
Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
|
||||
rebasing all their children accordingly. --writers-stopped allows merging to be a bit
|
||||
@@ -151,6 +150,10 @@ more effective in case of a single 'slim' read-write child and 'fat' removed par
|
||||
the child is merged into parent and parent is renamed to child in that case.
|
||||
In other cases parent layers are always merged into children.
|
||||
|
||||
Other options:
|
||||
|
||||
| `--down-ok` | Continue deletion/merging even if some data will be left on unavailable OSDs. |
|
||||
|
||||
## flatten
|
||||
|
||||
`vitastor-cli flatten <layer>`
|
||||
|
@@ -132,7 +132,7 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
|
||||
|
||||
Изменить размер, имя образа или флаг "только для чтения". Снимать флаг "только для чтения"
|
||||
и уменьшать размер образов, у которых есть дочерние клоны, без `--force` нельзя.
|
||||
@@ -140,13 +140,12 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
Если новый размер меньше старого, "лишние" данные будут удалены, поэтому перед уменьшением
|
||||
образа сначала уменьшите файловую систему в нём.
|
||||
|
||||
```
|
||||
-f|--force Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
|
||||
```
|
||||
| -f|--force | Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны. |
|
||||
| --down-ok | Разрешить уменьшение, даже если часть данных останется неудалённой на недоступных OSD. |
|
||||
|
||||
## rm
|
||||
|
||||
`vitastor-cli rm <from> [<to>] [--writers-stopped]`
|
||||
`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
|
||||
|
||||
Удалить образ `<from>` или все слои от `<from>` до `<to>` (`<to>` должен быть дочерним
|
||||
образом `<from>`), одновременно меняя родительские образы их клонов (если таковые есть).
|
||||
@@ -158,6 +157,10 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
В других случаях родительские слои вливаются в дочерние.
|
||||
|
||||
Другие опции:
|
||||
|
||||
| `--down-ok` | Продолжать удаление/слияние, даже если часть данных останется неудалённой на недоступных OSD. |
|
||||
|
||||
## flatten
|
||||
|
||||
`vitastor-cli flatten <layer>`
|
||||
|
@@ -261,7 +261,7 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
|
||||
```
|
||||
--object_size 128k Set blockstore block size
|
||||
--bitmap_granularity 4k Set bitmap granularity
|
||||
--journal_size 16M Set journal size
|
||||
--journal_size 32M Set journal size
|
||||
--data_csum_type none Set data checksum type (crc32c or none)
|
||||
--csum_block_size 4k Set data checksum block size
|
||||
--device_block_size 4k Set device block size
|
||||
|
@@ -267,7 +267,7 @@ OSD отключены fsync-и.
|
||||
```
|
||||
--object_size 128k Размер блока хранилища
|
||||
--bitmap_granularity 4k Гранулярность битовых карт
|
||||
--journal_size 16M Размер журнала
|
||||
--journal_size 32M Размер журнала
|
||||
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
|
||||
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
|
||||
--device_block_size 4k Размер блока устройства
|
||||
|
@@ -14,10 +14,13 @@ Vitastor has a fio driver which can be installed from the package vitastor-fio.
|
||||
Use the following command as an example to run tests with fio against a Vitastor cluster:
|
||||
|
||||
```
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -image=testimg
|
||||
```
|
||||
|
||||
If you don't want to access your image by name, you can specify pool number, inode number and size
|
||||
(`-pool=1 -inode=1 -size=400G`) instead of the image name (`-image=testimg`).
|
||||
|
||||
See exact fio commands to use for benchmarking [here](../performance/understanding.en.md#команды-fio).
|
||||
You can also specify etcd address(es) explicitly by adding `-etcd=10.115.0.10:2379/v3`, or you
|
||||
can override configuration file path by adding `-conf=/etc/vitastor/vitastor.conf`.
|
||||
|
||||
See exact fio commands to use for benchmarking [here](../performance/understanding.en.md#fio-commands).
|
||||
|
@@ -14,10 +14,13 @@
|
||||
Используйте следующую команду как пример для запуска тестов кластера Vitastor через fio:
|
||||
|
||||
```
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -image=testimg
|
||||
```
|
||||
|
||||
Вместо обращения к образу по имени (`-image=testimg`) можно указать номер пула, номер инода и размер:
|
||||
`-pool=1 -inode=1 -size=400G`.
|
||||
|
||||
Вы также можете задать адрес(а) подключения к etcd явно, добавив `-etcd=10.115.0.10:2379/v3`,
|
||||
или переопределить путь к файлу конфигурации, добавив `-conf=/etc/vitastor/vitastor.conf`.
|
||||
|
||||
Конкретные команды fio для тестирования производительности можно посмотреть [здесь](../performance/understanding.ru.md#команды-fio).
|
||||
|
@@ -34,7 +34,7 @@ vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
|
||||
--foreground 1 stay in foreground, do not daemonize
|
||||
```
|
||||
|
||||
Example start and mount commands:
|
||||
Example start and mount commands (etcd_address is optional):
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
|
@@ -33,7 +33,7 @@ vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
|
||||
--foreground 1 не уходить в фон после запуска
|
||||
```
|
||||
|
||||
Пример монтирования Vitastor через NFS:
|
||||
Пример монтирования Vitastor через NFS (etcd_address необязателен):
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
|
@@ -16,13 +16,16 @@ Old syntax (-drive):
|
||||
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024 \
|
||||
-drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9',
|
||||
-drive 'file=vitastor:image=debian9',
|
||||
format=raw,if=none,id=drive-virtio-disk0,cache=none \
|
||||
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
|
||||
id=virtio-disk0,bootindex=1,write-cache=off' \
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
|
||||
Etcd address may be specified explicitly by adding `:etcd_host=192.168.7.2\:2379/v3` to `file=`.
|
||||
Configuration file path may be overriden by adding `:config_path=/etc/vitastor/vitastor.conf`.
|
||||
|
||||
New syntax (-blockdev):
|
||||
|
||||
```
|
||||
@@ -50,12 +53,12 @@ You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>
|
||||
|
||||
## qemu-img
|
||||
|
||||
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
|
||||
For qemu-img, you should use `vitastor:image=<IMAGE>[:etcd_host=<HOST>]` as filename.
|
||||
|
||||
For example, to upload a VM image into Vitastor, run:
|
||||
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian10'
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:image=debian10'
|
||||
```
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||
@@ -72,10 +75,10 @@ the snapshot separately using the following commands (key points are using `skip
|
||||
`-B backing_file` option):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
qemu-img convert -f raw 'vitastor:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
qemu-img convert -f raw 'vitastor:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
|
@@ -18,13 +18,16 @@
|
||||
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024 \
|
||||
-drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9',
|
||||
-drive 'file=vitastor:image=debian9',
|
||||
format=raw,if=none,id=drive-virtio-disk0,cache=none \
|
||||
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
|
||||
id=virtio-disk0,bootindex=1,write-cache=off' \
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
|
||||
Адрес подключения etcd можно задать явно, если добавить `:etcd_host=192.168.7.2\:2379/v3` к `file=`.
|
||||
Путь к файлу конфигурации можно переопределить, добавив `:config_path=/etc/vitastor/vitastor.conf`.
|
||||
|
||||
Новый синтаксис (-blockdev):
|
||||
|
||||
```
|
||||
@@ -52,12 +55,12 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
|
||||
|
||||
## qemu-img
|
||||
|
||||
Для qemu-img используйте строку `vitastor:etcd_host=<HOST>:image=<IMAGE>` в качестве имени файла диска.
|
||||
Для qemu-img используйте строку `vitastor:image=<IMAGE>[:etcd_host=<HOST>]` в качестве имени файла диска.
|
||||
|
||||
Например, чтобы загрузить образ диска в Vitastor:
|
||||
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:image=testimg'
|
||||
```
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||
@@ -73,10 +76,10 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
||||
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
qemu-img convert -f raw 'vitastor:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
qemu-img convert -f raw 'vitastor:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
|
@@ -3,6 +3,7 @@
|
||||
|
||||
module.exports = {
|
||||
scale_pg_count,
|
||||
scale_pg_history,
|
||||
};
|
||||
|
||||
function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_pg)
|
||||
@@ -43,16 +44,18 @@ function finish_pg_history(merged_history)
|
||||
merged_history.all_peers = Object.values(merged_history.all_peers);
|
||||
}
|
||||
|
||||
function scale_pg_count(prev_pgs, real_prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
|
||||
function scale_pg_history(prev_pg_history, prev_pgs, new_pgs)
|
||||
{
|
||||
const old_pg_count = real_prev_pgs.length;
|
||||
const new_pg_history = [];
|
||||
const old_pg_count = prev_pgs.length;
|
||||
const new_pg_count = new_pgs.length;
|
||||
// Add all possibly intersecting PGs to the history of new PGs
|
||||
if (!(new_pg_count % old_pg_count))
|
||||
{
|
||||
// New PG count is a multiple of old PG count
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
{
|
||||
add_pg_history(new_pg_history, i, real_prev_pgs, prev_pg_history, i % old_pg_count);
|
||||
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
|
||||
finish_pg_history(new_pg_history[i]);
|
||||
}
|
||||
}
|
||||
@@ -64,7 +67,7 @@ function scale_pg_count(prev_pgs, real_prev_pgs, prev_pg_history, new_pg_history
|
||||
{
|
||||
for (let j = 0; j < mul; j++)
|
||||
{
|
||||
add_pg_history(new_pg_history, i, real_prev_pgs, prev_pg_history, i+j*new_pg_count);
|
||||
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
|
||||
}
|
||||
finish_pg_history(new_pg_history[i]);
|
||||
}
|
||||
@@ -76,7 +79,7 @@ function scale_pg_count(prev_pgs, real_prev_pgs, prev_pg_history, new_pg_history
|
||||
let merged_history = {};
|
||||
for (let i = 0; i < old_pg_count; i++)
|
||||
{
|
||||
add_pg_history(merged_history, 1, real_prev_pgs, prev_pg_history, i);
|
||||
add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
|
||||
}
|
||||
finish_pg_history(merged_history[1]);
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
@@ -89,6 +92,12 @@ function scale_pg_count(prev_pgs, real_prev_pgs, prev_pg_history, new_pg_history
|
||||
{
|
||||
new_pg_history[i] = null;
|
||||
}
|
||||
return new_pg_history;
|
||||
}
|
||||
|
||||
function scale_pg_count(prev_pgs, new_pg_count)
|
||||
{
|
||||
const old_pg_count = prev_pgs.length;
|
||||
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
|
||||
if (prev_pgs.length < new_pg_count)
|
||||
{
|
||||
|
419
mon/mon.js
419
mon/mon.js
@@ -55,10 +55,11 @@ const etcd_tree = {
|
||||
// etcd connection - configurable online
|
||||
etcd_address: "10.0.115.10:2379/v3",
|
||||
// mon
|
||||
etcd_mon_ttl: 30, // min: 10
|
||||
etcd_mon_ttl: 5, // min: 1
|
||||
etcd_mon_timeout: 1000, // ms. min: 0
|
||||
etcd_mon_retries: 5, // min: 0
|
||||
mon_change_timeout: 1000, // ms. min: 100
|
||||
mon_retry_change_timeout: 50, // ms. min: 10
|
||||
mon_stats_timeout: 1000, // ms. min: 100
|
||||
osd_out_time: 600, // seconds. min: 0
|
||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||
@@ -85,13 +86,14 @@ const etcd_tree = {
|
||||
client_max_buffered_bytes: 33554432,
|
||||
client_max_buffered_ops: 1024,
|
||||
client_max_writeback_iodepth: 256,
|
||||
client_retry_interval: 50, // ms. min: 10
|
||||
client_eio_retry_interval: 1000, // ms
|
||||
// client and osd - configurable online
|
||||
log_level: 0,
|
||||
peer_connect_interval: 5, // seconds. min: 1
|
||||
peer_connect_timeout: 5, // seconds. min: 1
|
||||
osd_idle_timeout: 5, // seconds. min: 1
|
||||
osd_ping_timeout: 5, // seconds. min: 1
|
||||
up_wait_retry_interval: 500, // ms. min: 50
|
||||
max_etcd_attempts: 5,
|
||||
etcd_quick_timeout: 1000, // ms
|
||||
etcd_slow_timeout: 5000, // ms
|
||||
@@ -110,7 +112,15 @@ const etcd_tree = {
|
||||
autosync_interval: 5,
|
||||
autosync_writes: 128,
|
||||
client_queue_depth: 128, // unused
|
||||
recovery_queue_depth: 4,
|
||||
recovery_queue_depth: 1,
|
||||
recovery_sleep_us: 0,
|
||||
recovery_tune_util_low: 0.1,
|
||||
recovery_tune_client_util_low: 0,
|
||||
recovery_tune_util_high: 1.0,
|
||||
recovery_tune_client_util_high: 0.5,
|
||||
recovery_tune_interval: 1,
|
||||
recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
|
||||
recovery_tune_sleep_min_us: 10, // 10 microseconds
|
||||
recovery_pg_switch: 128,
|
||||
recovery_sync_batch: 16,
|
||||
no_recovery: false,
|
||||
@@ -381,7 +391,8 @@ class Mon
|
||||
{
|
||||
constructor(config)
|
||||
{
|
||||
this.die = (e) => this._die(e);
|
||||
this.failconnect = (e) => this._die(e, 2);
|
||||
this.die = (e) => this._die(e, 1);
|
||||
if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf'))
|
||||
{
|
||||
config = {
|
||||
@@ -392,7 +403,7 @@ class Mon
|
||||
this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
|
||||
this.verbose = config.verbose || 0;
|
||||
this.initConfig = config;
|
||||
this.config = {};
|
||||
this.config = { ...config };
|
||||
this.etcd_prefix = config.etcd_prefix || '/vitastor';
|
||||
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
|
||||
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
|
||||
@@ -470,10 +481,10 @@ class Mon
|
||||
|
||||
check_config()
|
||||
{
|
||||
this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 30;
|
||||
if (this.config.etcd_mon_ttl < 10)
|
||||
this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 5;
|
||||
if (this.config.etcd_mon_ttl < 1)
|
||||
{
|
||||
this.config.etcd_mon_ttl = 10;
|
||||
this.config.etcd_mon_ttl = 1;
|
||||
}
|
||||
this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
|
||||
if (this.config.etcd_mon_timeout <= 0)
|
||||
@@ -490,6 +501,11 @@ class Mon
|
||||
{
|
||||
this.config.mon_change_timeout = 100;
|
||||
}
|
||||
this.config.mon_retry_change_timeout = Number(this.config.mon_retry_change_timeout) || 50;
|
||||
if (this.config.mon_retry_change_timeout < 50)
|
||||
{
|
||||
this.config.mon_retry_change_timeout = 50;
|
||||
}
|
||||
this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
|
||||
if (this.config.mon_stats_timeout < 100)
|
||||
{
|
||||
@@ -590,7 +606,7 @@ class Mon
|
||||
}
|
||||
if (!this.ws)
|
||||
{
|
||||
this.die('Failed to open etcd watch websocket');
|
||||
this.failconnect('Failed to open etcd watch websocket');
|
||||
}
|
||||
const cur_addr = this.selected_etcd_url;
|
||||
this.ws_alive = true;
|
||||
@@ -606,7 +622,7 @@ class Mon
|
||||
console.log('etcd websocket timed out, restarting it');
|
||||
this.restart_watcher(cur_addr);
|
||||
}
|
||||
}, (Number(this.config.etcd_keepalive_interval) || 30)*1000);
|
||||
}, (Number(this.config.etcd_ws_keepalive_interval) || 30)*1000);
|
||||
this.ws.on('error', () => this.restart_watcher(cur_addr));
|
||||
this.ws.send(JSON.stringify({
|
||||
create_request: {
|
||||
@@ -660,7 +676,12 @@ class Mon
|
||||
{
|
||||
this.parse_kv(e.kv);
|
||||
const key = e.kv.key.substr(this.etcd_prefix.length);
|
||||
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
|
||||
if (key.substr(0, 11) == '/osd/state/')
|
||||
{
|
||||
stats_changed = true;
|
||||
changed = true;
|
||||
}
|
||||
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
|
||||
{
|
||||
stats_changed = true;
|
||||
}
|
||||
@@ -777,9 +798,9 @@ class Mon
|
||||
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
|
||||
if (!res.result.TTL)
|
||||
{
|
||||
this.die('Lease expired');
|
||||
this.failconnect('Lease expired');
|
||||
}
|
||||
}, this.config.etcd_mon_timeout);
|
||||
}, this.config.etcd_mon_ttl*1000);
|
||||
if (!this.signals_set)
|
||||
{
|
||||
process.on('SIGINT', this.on_stop_cb);
|
||||
@@ -1222,6 +1243,89 @@ class Mon
|
||||
return aff_osds;
|
||||
}
|
||||
|
||||
async generate_pool_pgs(pool_id, osd_tree, levels)
|
||||
{
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''];
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
this.filter_osds_by_block_layout(
|
||||
pool_tree,
|
||||
pool_cfg.block_size || this.config.block_size || 131072,
|
||||
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
|
||||
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
|
||||
);
|
||||
// First try last_clean_pgs to minimize data movement
|
||||
let prev_pgs = [];
|
||||
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = [ ...this.state.history.last_clean_pgs.items[pool_id][pg].osd_set ];
|
||||
}
|
||||
if (!prev_pgs.length)
|
||||
{
|
||||
// Fall back to config/pgs if it's empty
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = [ ...this.state.config.pgs.items[pool_id][pg].osd_set ];
|
||||
}
|
||||
}
|
||||
const old_pg_count = prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
};
|
||||
let optimize_result;
|
||||
// Re-shuffle PGs if config/pgs.hash is empty
|
||||
if (old_pg_count > 0 && this.state.config.pgs.hash)
|
||||
{
|
||||
if (prev_pgs.length != pool_cfg.pg_count)
|
||||
{
|
||||
// Scale PG count
|
||||
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
|
||||
// because last_clean_pgs may still contain the old number of PGs
|
||||
PGUtil.scale_pg_count(prev_pgs, pool_cfg.pg_count);
|
||||
}
|
||||
for (const pg of prev_pgs)
|
||||
{
|
||||
while (pg.length < pool_cfg.pg_size)
|
||||
{
|
||||
pg.push(0);
|
||||
}
|
||||
}
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
...optimize_cfg,
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
|
||||
return {
|
||||
pool_id,
|
||||
pgs: optimize_result.int_pgs,
|
||||
stats: {
|
||||
total_raw_tb: optimize_result.space,
|
||||
pg_real_size: pg_effsize || pool_cfg.pg_size,
|
||||
raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
|
||||
? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
|
||||
space_efficiency: optimize_result.space/(optimize_result.total_space||1),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
async recheck_pgs()
|
||||
{
|
||||
if (this.recheck_pgs_active)
|
||||
@@ -1236,158 +1340,47 @@ class Mon
|
||||
const { up_osds, levels, osd_tree } = this.get_osd_tree();
|
||||
const tree_cfg = {
|
||||
osd_tree,
|
||||
levels,
|
||||
pools: this.state.config.pools,
|
||||
};
|
||||
const tree_hash = sha1hex(stableStringify(tree_cfg));
|
||||
if (this.state.config.pgs.hash != tree_hash)
|
||||
{
|
||||
// Something has changed
|
||||
const new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
const etcd_request = { compare: [], success: [] };
|
||||
for (const pool_id in (this.state.config.pgs||{}).items||{})
|
||||
console.log('Pool configuration or OSD tree changed, re-optimizing');
|
||||
// First re-optimize PGs, but don't look at history yet
|
||||
const optimize_results = await Promise.all(Object.keys(this.state.config.pools)
|
||||
.map(pool_id => this.generate_pool_pgs(pool_id, osd_tree, levels)));
|
||||
// Then apply the modification in the form of an optimistic transaction,
|
||||
// each time considering new pg/history modifications (OSDs modify it during rebalance)
|
||||
while (!await this.apply_pool_pgs(optimize_results, up_osds, osd_tree, tree_hash))
|
||||
{
|
||||
if (!this.state.config.pools[pool_id])
|
||||
{
|
||||
// Pool deleted. Delete all PGs, but first stop them.
|
||||
if (!await this.stop_all_pgs(pool_id))
|
||||
{
|
||||
this.recheck_pgs_active = false;
|
||||
this.schedule_recheck();
|
||||
return;
|
||||
}
|
||||
const prev_pgs = [];
|
||||
for (const pg in this.state.config.pgs.items[pool_id]||{})
|
||||
{
|
||||
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
// Also delete pool statistics
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
} });
|
||||
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
|
||||
}
|
||||
}
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let pool_tree = osd_tree[pool_cfg.root_node || ''];
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
this.filter_osds_by_block_layout(
|
||||
pool_tree,
|
||||
pool_cfg.block_size || this.config.block_size || 131072,
|
||||
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
|
||||
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
|
||||
console.log(
|
||||
'Someone changed PG configuration while we also tried to change it.'+
|
||||
' Retrying in '+this.config.mon_retry_change_timeout+' ms'
|
||||
);
|
||||
// These are for the purpose of building history.osd_sets
|
||||
const real_prev_pgs = [];
|
||||
let pg_history = [];
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
// Failed to apply - parallel change detected. Wait a bit and retry
|
||||
const old_rev = this.etcd_watch_revision;
|
||||
while (this.etcd_watch_revision === old_rev)
|
||||
{
|
||||
real_prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
|
||||
if (this.state.pg.history[pool_id] &&
|
||||
this.state.pg.history[pool_id][pg])
|
||||
{
|
||||
pg_history[pg-1] = this.state.pg.history[pool_id][pg];
|
||||
}
|
||||
await new Promise(ok => setTimeout(ok, this.config.mon_retry_change_timeout));
|
||||
}
|
||||
// And these are for the purpose of minimizing data movement
|
||||
let prev_pgs = [];
|
||||
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = this.state.history.last_clean_pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
prev_pgs = JSON.parse(JSON.stringify(prev_pgs.length ? prev_pgs : real_prev_pgs));
|
||||
const old_pg_count = real_prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
ordered: pool_cfg.scheme != 'replicated',
|
||||
const new_ot = this.get_osd_tree();
|
||||
const new_tcfg = {
|
||||
osd_tree: new_ot.osd_tree,
|
||||
levels: new_ot.levels,
|
||||
pools: this.state.config.pools,
|
||||
};
|
||||
let optimize_result;
|
||||
if (old_pg_count > 0)
|
||||
if (sha1hex(stableStringify(new_tcfg)) !== tree_hash)
|
||||
{
|
||||
if (old_pg_count != pool_cfg.pg_count)
|
||||
{
|
||||
// PG count changed. Need to bring all PGs down.
|
||||
if (!await this.stop_all_pgs(pool_id))
|
||||
{
|
||||
this.recheck_pgs_active = false;
|
||||
this.schedule_recheck();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (prev_pgs.length != pool_cfg.pg_count)
|
||||
{
|
||||
// Scale PG count
|
||||
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
|
||||
// because last_clean_pgs may still contain the old number of PGs
|
||||
const new_pg_history = [];
|
||||
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
|
||||
pg_history = new_pg_history;
|
||||
}
|
||||
for (const pg of prev_pgs)
|
||||
{
|
||||
while (pg.length < pool_cfg.pg_size)
|
||||
{
|
||||
pg.push(0);
|
||||
}
|
||||
}
|
||||
if (!this.state.config.pgs.hash)
|
||||
{
|
||||
// Re-shuffle PGs
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
...optimize_cfg,
|
||||
});
|
||||
}
|
||||
// Configuration actually changed, restart from the beginning
|
||||
this.recheck_pgs_active = false;
|
||||
setImmediate(() => this.recheck_pgs().catch(this.die));
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
if (old_pg_count != optimize_result.int_pgs.length)
|
||||
{
|
||||
console.log(
|
||||
`PG count for pool ${pool_id} (${pool_cfg.name || 'unnamed'})`+
|
||||
` changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`
|
||||
);
|
||||
// Drop stats
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'/'),
|
||||
range_end: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'0'),
|
||||
} });
|
||||
}
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
|
||||
this.state.pool.stats[pool_id] = {
|
||||
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
|
||||
total_raw_tb: optimize_result.space,
|
||||
pg_real_size: pg_effsize || pool_cfg.pg_size,
|
||||
raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
|
||||
? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
|
||||
space_efficiency: optimize_result.space/(optimize_result.total_space||1),
|
||||
};
|
||||
etcd_request.success.push({ requestPut: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(this.state.pool.stats[pool_id])),
|
||||
} });
|
||||
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
|
||||
// Configuration didn't change, PG history probably changed, so just retry
|
||||
}
|
||||
new_config_pgs.hash = tree_hash;
|
||||
await this.save_pg_config(new_config_pgs, etcd_request);
|
||||
console.log('PG configuration successfully changed');
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1428,12 +1421,97 @@ class Mon
|
||||
}
|
||||
if (changed)
|
||||
{
|
||||
await this.save_pg_config(new_config_pgs);
|
||||
const ok = await this.save_pg_config(new_config_pgs);
|
||||
if (ok)
|
||||
console.log('PG configuration successfully changed');
|
||||
else
|
||||
{
|
||||
console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
|
||||
this.schedule_recheck();
|
||||
}
|
||||
}
|
||||
}
|
||||
this.recheck_pgs_active = false;
|
||||
}
|
||||
|
||||
async apply_pool_pgs(results, up_osds, osd_tree, tree_hash)
|
||||
{
|
||||
for (const pool_id in (this.state.config.pgs||{}).items||{})
|
||||
{
|
||||
// We should stop all PGs when deleting a pool or changing its PG count
|
||||
if (!this.state.config.pools[pool_id] ||
|
||||
this.state.config.pgs.items[pool_id] && this.state.config.pools[pool_id].pg_count !=
|
||||
Object.keys(this.state.config.pgs.items[pool_id]).reduce((a, c) => (a < (0|c) ? (0|c) : a), 0))
|
||||
{
|
||||
if (!await this.stop_all_pgs(pool_id))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
const new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
const etcd_request = { compare: [], success: [] };
|
||||
for (const pool_id in (new_config_pgs||{}).items||{})
|
||||
{
|
||||
if (!this.state.config.pools[pool_id])
|
||||
{
|
||||
const prev_pgs = [];
|
||||
for (const pg in new_config_pgs.items[pool_id]||{})
|
||||
{
|
||||
prev_pgs[pg-1] = new_config_pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
// Also delete pool statistics
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
} });
|
||||
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
|
||||
}
|
||||
}
|
||||
for (const pool_res of results)
|
||||
{
|
||||
const pool_id = pool_res.pool_id;
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
let pg_history = [];
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
if (this.state.pg.history[pool_id] &&
|
||||
this.state.pg.history[pool_id][pg])
|
||||
{
|
||||
pg_history[pg-1] = this.state.pg.history[pool_id][pg];
|
||||
}
|
||||
}
|
||||
const real_prev_pgs = [];
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
real_prev_pgs[pg-1] = [ ...this.state.config.pgs.items[pool_id][pg].osd_set ];
|
||||
}
|
||||
if (real_prev_pgs.length > 0 && real_prev_pgs.length != pool_res.pgs.length)
|
||||
{
|
||||
console.log(
|
||||
`Changing PG count for pool ${pool_id} (${pool_cfg.name || 'unnamed'})`+
|
||||
` from: ${real_prev_pgs.length} to ${pool_res.pgs.length}`
|
||||
);
|
||||
pg_history = PGUtil.scale_pg_history(pg_history, real_prev_pgs, pool_res.pgs);
|
||||
// Drop stats
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'/'),
|
||||
range_end: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'0'),
|
||||
} });
|
||||
}
|
||||
const stats = {
|
||||
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
|
||||
...pool_res.stats,
|
||||
};
|
||||
etcd_request.success.push({ requestPut: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(stats)),
|
||||
} });
|
||||
this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
|
||||
}
|
||||
new_config_pgs.hash = tree_hash;
|
||||
return await this.save_pg_config(new_config_pgs, etcd_request);
|
||||
}
|
||||
|
||||
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
|
||||
{
|
||||
etcd_request.compare.push(
|
||||
@@ -1443,14 +1521,8 @@ class Mon
|
||||
etcd_request.success.push(
|
||||
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_config_pgs)) } },
|
||||
);
|
||||
const res = await this.etcd_call('/kv/txn', etcd_request, this.config.etcd_mon_timeout, 0);
|
||||
if (!res.succeeded)
|
||||
{
|
||||
console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
|
||||
this.schedule_recheck();
|
||||
return;
|
||||
}
|
||||
console.log('PG configuration successfully changed');
|
||||
const txn_res = await this.etcd_call('/kv/txn', etcd_request, this.config.etcd_mon_timeout, 0);
|
||||
return txn_res.succeeded;
|
||||
}
|
||||
|
||||
// Schedule next recheck at least at <unixtime>
|
||||
@@ -1569,9 +1641,13 @@ class Mon
|
||||
}
|
||||
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
|
||||
// Sum derived values instead of deriving summed
|
||||
for (const osd in this.state.osd.stats)
|
||||
for (const osd in this.state.osd.state)
|
||||
{
|
||||
const derived = this.prev_stats.osd_diff[osd];
|
||||
if (!this.state.osd.state[osd] || !derived)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
for (const type in sum_diff)
|
||||
{
|
||||
for (const op in derived[type]||{})
|
||||
@@ -1672,9 +1748,13 @@ class Mon
|
||||
const used = this.state.pool.stats[pool_id].used_raw_tb;
|
||||
this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
|
||||
}
|
||||
for (const osd_num in this.state.osd.inodestats)
|
||||
for (const osd_num in this.state.osd.state)
|
||||
{
|
||||
const ist = this.state.osd.inodestats[osd_num];
|
||||
if (!ist || !this.state.osd.state[osd_num])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
for (const pool_id in ist)
|
||||
{
|
||||
inode_stats[pool_id] = inode_stats[pool_id] || {};
|
||||
@@ -1690,9 +1770,14 @@ class Mon
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const osd in this.prev_stats.osd_diff)
|
||||
for (const osd in this.state.osd.state)
|
||||
{
|
||||
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
|
||||
const osd_diff = this.prev_stats.osd_diff[osd];
|
||||
if (!osd_diff || !this.state.osd.state[osd])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
for (const pool_id in osd_diff.inode_stats)
|
||||
{
|
||||
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
|
||||
{
|
||||
@@ -1932,14 +2017,14 @@ class Mon
|
||||
return res.json;
|
||||
}
|
||||
}
|
||||
this.die();
|
||||
this.failconnect();
|
||||
}
|
||||
|
||||
_die(err)
|
||||
_die(err, code)
|
||||
{
|
||||
// In fact we can just try to rejoin
|
||||
console.error(new Error(err || 'Cluster connection failed'));
|
||||
process.exit(1);
|
||||
process.exit(code || 2);
|
||||
}
|
||||
|
||||
local_ips(all)
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vitastor-mon",
|
||||
"version": "1.3.1",
|
||||
"version": "1.4.8",
|
||||
"description": "Vitastor SDS monitor service",
|
||||
"main": "mon-main.js",
|
||||
"scripts": {
|
||||
|
@@ -8,7 +8,9 @@ PartOf=vitastor.target
|
||||
LimitNOFILE=1048576
|
||||
LimitNPROC=1048576
|
||||
LimitMEMLOCK=infinity
|
||||
ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1'
|
||||
# Use the following for direct logs to files
|
||||
#ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1'
|
||||
ExecStart=vitastor-disk exec-osd /dev/vitastor/osd%i-data
|
||||
ExecStartPre=+vitastor-disk pre-exec /dev/vitastor/osd%i-data
|
||||
WorkingDirectory=/
|
||||
User=vitastor
|
||||
|
@@ -110,7 +110,6 @@ sub properties
|
||||
vitastor_etcd_address => {
|
||||
description => 'IP address(es) of etcd.',
|
||||
type => 'string',
|
||||
format => 'pve-storage-portal-dns-list',
|
||||
},
|
||||
vitastor_etcd_prefix => {
|
||||
description => 'Prefix for Vitastor etcd metadata',
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '1.3.1'
|
||||
VERSION = '1.4.8'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
692
patches/libvirt-8.0-vitastor.diff
Normal file
692
patches/libvirt-8.0-vitastor.diff
Normal file
@@ -0,0 +1,692 @@
|
||||
commit d85024bd803b3b91f15578ed22de4ce31856626f
|
||||
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Date: Wed Jan 24 18:07:43 2024 +0300
|
||||
|
||||
Add Vitastor support
|
||||
|
||||
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
|
||||
index 7fa5c2b8b5..2d77f391e7 100644
|
||||
--- a/docs/schemas/domaincommon.rng
|
||||
+++ b/docs/schemas/domaincommon.rng
|
||||
@@ -1898,6 +1898,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2154,6 +2183,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index f89856b93e..a8cb9387e2 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -246,6 +246,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17,
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18,
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19,
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20,
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index 5691b8d2d5..6669e8451d 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -8293,7 +8293,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -31267,6 +31268,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index a4271f1247..621c1b7b31 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -508,7 +508,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -517,11 +517,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
"only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
-
|
||||
+ }
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
_("<config> element is currently supported "
|
||||
- "only with 'rbd' disks"));
|
||||
+ "only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 6690d26ffd..2255df9d28 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -60,7 +60,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -246,6 +246,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -546,6 +558,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1176,6 +1193,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index aaecf138d6..97172db38b 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -106,6 +106,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -466,6 +467,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index d42f715f26..29d8da3d10 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -86,6 +86,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1265,6 +1266,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index c4a026881c..67568e9181 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -128,6 +128,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 02903ac487..504df599fb 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1481,6 +1481,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
@@ -1978,6 +1979,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index cbc522b300..b4760fa58d 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -92,6 +92,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 1ac6253ad7..abe4587f94 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -962,6 +962,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index 7604e3d534..6453bb9776 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1506,6 +1506,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index e5ff653a60..884ecc79ea 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -943,6 +943,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectCreate(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1233,6 +1265,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -2244,6 +2282,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2626,6 +2665,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
|
||||
index d822533ccb..afe2087303 100644
|
||||
--- a/src/qemu/qemu_command.c
|
||||
+++ b/src/qemu/qemu_command.c
|
||||
@@ -1723,6 +1723,43 @@ qemuBuildNetworkDriveStr(virStorageSource *src,
|
||||
ret = virBufferContentAndReset(&buf);
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (strchr(src->path, ':')) {
|
||||
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
|
||||
+ _("':' not allowed in Vitastor source volume name '%s'"),
|
||||
+ src->path);
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
|
||||
+
|
||||
+ if (src->nhosts > 0) {
|
||||
+ virBufferAddLit(&buf, ":etcd-host=");
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ if (i)
|
||||
+ virBufferAddLit(&buf, ",");
|
||||
+
|
||||
+ /* assume host containing : is ipv6 */
|
||||
+ if (strchr(src->hosts[i].name, ':'))
|
||||
+ virBufferEscape(&buf, '\\', ":", "[%s]",
|
||||
+ src->hosts[i].name);
|
||||
+ else
|
||||
+ virBufferAsprintf(&buf, "%s", src->hosts[i].name);
|
||||
+
|
||||
+ if (src->hosts[i].port)
|
||||
+ virBufferAsprintf(&buf, "\\:%u", src->hosts[i].port);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (src->configFile)
|
||||
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
|
||||
+
|
||||
+ if (src->query)
|
||||
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->query);
|
||||
+
|
||||
+ ret = virBufferContentAndReset(&buf);
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
||||
_("VxHS protocol does not support URI syntax"));
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index a8401bac30..3dc1fe6db0 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -4731,7 +4731,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -9919,6 +9920,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index f92e00f9c0..854a3fbc90 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -393,6 +393,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -485,6 +486,7 @@ qemuSnapshotPrepareDiskExternalActive(virDomainObj *vm,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
case VIR_STORAGE_NET_PROTOCOL_HTTP:
|
||||
@@ -638,6 +640,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index 4df2c75a2b..5a5e48ef71 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1643,6 +1643,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index e48ae725ab..2017ccc88c 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -284,6 +284,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -396,6 +465,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -984,6 +1058,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1162,6 +1284,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index 0e93b79922..b4d33f5f56 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7367,6 +7367,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index 449b745519..7f95cc8e08 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -68,6 +68,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index d391257f6e..46799c4a90 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1213,6 +1213,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
643
patches/libvirt-9.10-vitastor.diff
Normal file
643
patches/libvirt-9.10-vitastor.diff
Normal file
@@ -0,0 +1,643 @@
|
||||
commit c1cd026e211e94b120028e7c98a6e4ce5afe9846
|
||||
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Date: Wed Jan 24 22:04:50 2024 +0300
|
||||
|
||||
Add Vitastor support
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3da1..5f5daa8341 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index 22ad43e1d7..56c81d6852 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7185,7 +7185,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -30618,6 +30619,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index c72108886e..c739ed6c43 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
_("<snapshot> element is currently supported only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
+ }
|
||||
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
- _("<config> element is currently supported only with 'rbd' disks"));
|
||||
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index b98a2ae602..7d7a872e01 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -1997,6 +1997,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2347,6 +2376,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 68842004b7..1d69a788b6 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957cfe..720c07ef74 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index f974a521b1..cd394d0a9f 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index 5e7d127453..283709eeb3 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -129,6 +129,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 59fa5da372..4739167f5f 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index db7660aac4..561df34709 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 62e1be6672..71a1d42896 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index f175359307..8efcf4c329 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index 7e9daf0bdc..825b4a3006 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -2032,6 +2070,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2415,6 +2454,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index 953808fcfe..62860283d8 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -5215,7 +5215,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -10340,6 +10341,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index 73ff533827..e9c799ca8f 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index 314fe930e0..fb615a8b4e 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index 80681924ea..8a3ade9ec0 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index e87d7cfd44..ccc05d7aae 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7335,6 +7335,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index e8e40d695e..db55fe5f3a 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index 36f00cf643..5f5bd3464e 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1223,6 +1223,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
28
pull_request_template.yml
Normal file
28
pull_request_template.yml
Normal file
@@ -0,0 +1,28 @@
|
||||
name: Pull Request
|
||||
about: Submit a pull request
|
||||
body:
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: Describe your pull request
|
||||
placeholder: ""
|
||||
value: ""
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
id: author
|
||||
attributes:
|
||||
label: Contributor Name
|
||||
description: Contributor Name or Company Details if the Contributor is a company
|
||||
placeholder: ""
|
||||
validations:
|
||||
required: false
|
||||
- type: checkboxes
|
||||
id: terms
|
||||
attributes:
|
||||
label: CLA
|
||||
description: By submitting this pull request, I accept [Vitastor CLA](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md)
|
||||
options:
|
||||
- label: "I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md"
|
||||
required: true
|
@@ -24,4 +24,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-1.4.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.8$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -36,7 +36,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.4.8.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.3.1
|
||||
Version: 1.4.8
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.3.1.el7.tar.gz
|
||||
Source0: vitastor-1.4.8.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.4.8.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.3.1
|
||||
Version: 1.4.8
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.3.1.el8.tar.gz
|
||||
Source0: vitastor-1.4.8.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -18,7 +18,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-1.4.8.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 1.3.1
|
||||
Version: 1.4.8
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-1.3.1.el9.tar.gz
|
||||
Source0: vitastor-1.4.8.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -16,8 +16,8 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="1.3.1")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_definitions(-DVERSION="1.4.8")
|
||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_link_options(-fno-omit-frame-pointer)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address)
|
||||
|
@@ -8,6 +8,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#include <stdexcept>
|
||||
#include <set>
|
||||
|
||||
#include "addr_util.h"
|
||||
|
||||
@@ -135,7 +136,7 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
|
||||
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> addresses;
|
||||
std::set<std::string> addresses;
|
||||
ifaddrs *list, *ifa;
|
||||
if (getifaddrs(&list) == -1)
|
||||
{
|
||||
@@ -149,7 +150,8 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
|
||||
}
|
||||
int family = ifa->ifa_addr->sa_family;
|
||||
if ((family == AF_INET || family == AF_INET6 && include_v6) &&
|
||||
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
|
||||
// Do not skip loopback addresses if the address filter is specified
|
||||
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | (masks.size() ? 0 : IFF_LOOPBACK))) == (IFF_UP | IFF_RUNNING))
|
||||
{
|
||||
void *addr_ptr;
|
||||
if (family == AF_INET)
|
||||
@@ -182,11 +184,11 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
|
||||
{
|
||||
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
|
||||
}
|
||||
addresses.push_back(std::string(addr));
|
||||
addresses.insert(std::string(addr));
|
||||
}
|
||||
}
|
||||
freeifaddrs(list);
|
||||
return addresses;
|
||||
return std::vector<std::string>(addresses.begin(), addresses.end());
|
||||
}
|
||||
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
|
||||
|
@@ -108,6 +108,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||
}
|
||||
else if (journal_block_size > MAX_DATA_BLOCK_SIZE)
|
||||
{
|
||||
throw std::runtime_error("journal_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||
}
|
||||
if (!meta_block_size)
|
||||
{
|
||||
meta_block_size = 4096;
|
||||
@@ -116,6 +120,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||
}
|
||||
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
|
||||
{
|
||||
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||
}
|
||||
if (data_offset % disk_alignment)
|
||||
{
|
||||
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
|
||||
|
@@ -19,7 +19,6 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
|
||||
syncing_flushers = 0;
|
||||
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
|
||||
flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
|
||||
journal_trim_interval = 512;
|
||||
journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
|
||||
trim_wanted = bs->journal.flush_journal ? 1 : 0;
|
||||
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
|
||||
@@ -94,7 +93,7 @@ void journal_flusher_t::loop()
|
||||
void journal_flusher_t::enqueue_flush(obj_ver_id ov)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
printf("enqueue_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
#endif
|
||||
auto it = flush_versions.find(ov.oid);
|
||||
if (it != flush_versions.end())
|
||||
@@ -117,7 +116,7 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
|
||||
void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
printf("unshift_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
#endif
|
||||
auto it = flush_versions.find(ov.oid);
|
||||
if (it != flush_versions.end())
|
||||
@@ -143,7 +142,7 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
|
||||
void journal_flusher_t::remove_flush(object_id oid)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
|
||||
printf("undo_flush %jx:%jx\n", oid.inode, oid.stripe);
|
||||
#endif
|
||||
auto v_it = flush_versions.find(oid);
|
||||
if (v_it != flush_versions.end())
|
||||
@@ -184,8 +183,7 @@ void journal_flusher_t::mark_trim_possible()
|
||||
if (trim_wanted > 0)
|
||||
{
|
||||
dequeuing = true;
|
||||
if (!journal_trim_counter)
|
||||
journal_trim_counter = journal_trim_interval;
|
||||
journal_trim_counter = 0;
|
||||
bs->ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
@@ -235,7 +233,7 @@ void journal_flusher_t::dump_diagnostics()
|
||||
break;
|
||||
}
|
||||
printf(
|
||||
"Flusher: queued=%ld first=%s%lx:%lx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
|
||||
"Flusher: queued=%zd first=%s%jx:%jx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
|
||||
flush_queue.size(), unflushable_type, unflushable.oid.inode, unflushable.oid.stripe,
|
||||
trim_wanted, dequeuing, trimming, cur_flusher_count, target_flusher_count,
|
||||
active_flushers, syncing_flushers
|
||||
@@ -268,7 +266,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
|
||||
{
|
||||
int search_left = flush_queue.size() - 1;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Flusher overran writers (%lx:%lx v%lu, dirty_start=%08lx) - searching for older flushes (%d left)\n",
|
||||
printf("Flusher overran writers (%jx:%jx v%ju, dirty_start=%08jx) - searching for older flushes (%d left)\n",
|
||||
cur.oid.inode, cur.oid.stripe, cur.version, bs->journal.dirty_start, search_left);
|
||||
#endif
|
||||
while (search_left > 0)
|
||||
@@ -285,7 +283,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
|
||||
dirty_end->second.journal_sector < bs->journal.used_start))
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Write %lx:%lx v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
|
||||
printf("Write %jx:%jx v%ju is too new: offset=%08jx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
|
||||
#endif
|
||||
enqueue_flush(cur);
|
||||
}
|
||||
@@ -366,9 +364,10 @@ resume_0:
|
||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
|
||||
if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
|
||||
{
|
||||
// Attempt forced trim
|
||||
cur.oid = {};
|
||||
flusher->active_flushers++;
|
||||
goto trim_journal;
|
||||
}
|
||||
@@ -387,7 +386,7 @@ stop_flusher:
|
||||
if (repeat_it != flusher->sync_to_repeat.end())
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Postpone %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
printf("Postpone %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
#endif
|
||||
// We don't flush different parts of history of the same object in parallel
|
||||
// So we check if someone is already flushing this object
|
||||
@@ -416,12 +415,13 @@ stop_flusher:
|
||||
flusher->sync_to_repeat.erase(cur.oid);
|
||||
if (!flusher->try_find_other(dirty_end, cur))
|
||||
{
|
||||
cur.oid = {};
|
||||
goto stop_flusher;
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Flushing %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
#endif
|
||||
flusher->active_flushers++;
|
||||
// Find it in clean_db
|
||||
@@ -448,7 +448,7 @@ stop_flusher:
|
||||
// Object not allocated. This is a bug.
|
||||
char err[1024];
|
||||
snprintf(
|
||||
err, 1024, "BUG: Object %lx:%lx v%lu that we are trying to flush is not allocated on the data device",
|
||||
err, 1024, "BUG: Object %jx:%jx v%ju that we are trying to flush is not allocated on the data device",
|
||||
cur.oid.inode, cur.oid.stripe, cur.version
|
||||
);
|
||||
throw std::runtime_error(err);
|
||||
@@ -538,7 +538,7 @@ resume_2:
|
||||
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
|
||||
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
|
||||
{
|
||||
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
|
||||
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",
|
||||
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
|
||||
old_entry->version, cur.oid.inode, cur.oid.stripe);
|
||||
exit(1);
|
||||
@@ -571,7 +571,7 @@ resume_2:
|
||||
// Erase dirty_db entries
|
||||
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
|
||||
printf("Flushed %jx:%jx v%ju (%d copies, wr:%d, del:%d), %jd left\n", cur.oid.inode, cur.oid.stripe, cur.version,
|
||||
copy_count, has_writes, has_delete, flusher->flush_queue.size());
|
||||
#endif
|
||||
release_oid:
|
||||
@@ -584,7 +584,8 @@ resume_2:
|
||||
flusher->sync_to_repeat.erase(repeat_it);
|
||||
trim_journal:
|
||||
// Clear unused part of the journal every <journal_trim_interval> flushes
|
||||
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
|
||||
if (bs->journal_trim_interval && !((++flusher->journal_trim_counter) % bs->journal_trim_interval) ||
|
||||
flusher->trim_wanted > 0)
|
||||
{
|
||||
resume_26:
|
||||
resume_27:
|
||||
@@ -609,8 +610,8 @@ void journal_flusher_co::update_metadata_entry()
|
||||
{
|
||||
printf(
|
||||
has_delete
|
||||
? "Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx v%lu\n"
|
||||
: "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
|
||||
? "Fatal error (metadata corruption or bug): tried to delete metadata entry %ju (%jx:%jx v%ju) while deleting %jx:%jx v%ju\n"
|
||||
: "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
|
||||
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
||||
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
||||
);
|
||||
@@ -710,7 +711,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int
|
||||
if (wait_state == wait_base)
|
||||
goto resume_0;
|
||||
await_sqe(0);
|
||||
data->iov = (struct iovec){ meta_block.buf, bs->dsk.meta_block_size };
|
||||
data->iov = (struct iovec){ meta_block.buf, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_block.sector
|
||||
@@ -760,7 +761,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||
{
|
||||
// If we encounter bad checksums during flush, we still update the bad block,
|
||||
// but intentionally mangle checksums to avoid hiding the corruption.
|
||||
iovec iov = { .iov_base = v[i].buf, .iov_len = v[i].len };
|
||||
iovec iov = { .iov_base = v[i].buf, .iov_len = (size_t)v[i].len };
|
||||
if (!(v[i].copy_flags & COPY_BUF_JOURNAL))
|
||||
{
|
||||
assert(!(v[i].offset % bs->dsk.csum_block_size));
|
||||
@@ -768,7 +769,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||
bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
|
||||
v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n",
|
||||
printf("Checksum mismatch in object %jx:%jx v%ju in data area at offset 0x%jx+0x%x: got %08x, expected %08x\n",
|
||||
cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
|
||||
for (uint32_t j = 0; j < bs->dsk.csum_block_size; j += bs->dsk.bitmap_granularity)
|
||||
{
|
||||
@@ -781,7 +782,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||
{
|
||||
bs->verify_journal_checksums(v[i].csum_buf, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
|
||||
{
|
||||
printf("Checksum mismatch in object %lx:%lx v%lu in journal at offset 0x%lx+0x%x (block offset 0x%lx): got %08x, expected %08x\n",
|
||||
printf("Checksum mismatch in object %jx:%jx v%ju in journal at offset 0x%jx+0x%x (block offset 0x%jx): got %08x, expected %08x\n",
|
||||
cur.oid.inode, cur.oid.stripe, old_clean_ver,
|
||||
v[i].disk_offset, bad_block, v[i].offset, calc_csum, stored_csum);
|
||||
bad_block += (v[i].offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size;
|
||||
@@ -805,7 +806,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
|
||||
if (new_entry->oid != cur.oid)
|
||||
{
|
||||
printf(
|
||||
"Fatal error (metadata corruption or bug): tried to make holes in %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
|
||||
"Fatal error (metadata corruption or bug): tried to make holes in %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
|
||||
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
|
||||
new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
|
||||
);
|
||||
@@ -925,7 +926,7 @@ void journal_flusher_co::scan_dirty()
|
||||
{
|
||||
char err[1024];
|
||||
snprintf(
|
||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
|
||||
err, 1024, "BUG: Unexpected dirty_entry %jx:%jx v%ju unstable state during flush: 0x%x",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||
);
|
||||
throw std::runtime_error(err);
|
||||
@@ -1021,7 +1022,7 @@ void journal_flusher_co::scan_dirty()
|
||||
// May happen if the metadata entry is corrupt, but journal isn't
|
||||
// FIXME: Report corrupted object to the upper layer (OSD)
|
||||
printf(
|
||||
"Warning: object %lx:%lx has overwrites, but doesn't have a clean version."
|
||||
"Warning: object %jx:%jx has overwrites, but doesn't have a clean version."
|
||||
" Metadata is likely corrupted. Dropping object from the DB.\n",
|
||||
cur.oid.inode, cur.oid.stripe
|
||||
);
|
||||
@@ -1056,7 +1057,7 @@ void journal_flusher_co::scan_dirty()
|
||||
flusher->enqueue_flush(cur);
|
||||
cur.version = dirty_end->first.version;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Partial checksum block overwrites found - rewinding flush back to %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
printf("Partial checksum block overwrites found - rewinding flush back to %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
#endif
|
||||
v.clear();
|
||||
copy_count = 0;
|
||||
@@ -1084,7 +1085,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
|
||||
auto & vi = v[v.size()-i];
|
||||
assert(vi.len != 0);
|
||||
vi.buf = memalign_or_die(MEM_ALIGNMENT, vi.len);
|
||||
data->iov = (struct iovec){ vi.buf, vi.len };
|
||||
data->iov = (struct iovec){ vi.buf, (size_t)vi.len };
|
||||
data->callback = simple_callback_r;
|
||||
my_uring_prep_readv(
|
||||
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
|
||||
@@ -1208,7 +1209,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
|
||||
.usage_count = 1,
|
||||
}).first;
|
||||
await_sqe(0);
|
||||
data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size };
|
||||
data->iov = (struct iovec){ wr.it->second.buf, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = simple_callback_r;
|
||||
wr.submitted = true;
|
||||
my_uring_prep_readv(
|
||||
@@ -1247,7 +1248,7 @@ void journal_flusher_co::free_data_blocks()
|
||||
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
||||
bool used = uo_it != bs->used_clean_objects.end();
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
printf("%s block %ju from %jx:%jx v%ju (new location is %ju)\n",
|
||||
used ? "Postpone free" : "Free",
|
||||
old_clean_loc >> bs->dsk.block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version,
|
||||
@@ -1264,7 +1265,7 @@ void journal_flusher_co::free_data_blocks()
|
||||
auto uo_it = bs->used_clean_objects.find(old_clean_loc);
|
||||
bool used = uo_it != bs->used_clean_objects.end();
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("%s block %lu from %lx:%lx v%lu (delete)\n",
|
||||
printf("%s block %ju from %jx:%jx v%ju (delete)\n",
|
||||
used ? "Postpone free" : "Free",
|
||||
old_clean_loc >> bs->dsk.block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
@@ -1346,7 +1347,6 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||
else if (wait_state == wait_base+2) goto resume_2;
|
||||
else if (wait_state == wait_base+3) goto resume_3;
|
||||
else if (wait_state == wait_base+4) goto resume_4;
|
||||
flusher->journal_trim_counter = 0;
|
||||
new_trim_pos = bs->journal.get_trim_pos();
|
||||
if (new_trim_pos != bs->journal.used_start)
|
||||
{
|
||||
@@ -1378,7 +1378,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||
.csum_block_size = bs->dsk.csum_block_size,
|
||||
};
|
||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
|
||||
data->iov = (struct iovec){ flusher->journal_superblock, (size_t)bs->dsk.journal_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||
wait_count++;
|
||||
@@ -1410,7 +1410,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||
}
|
||||
bs->journal.used_start = new_trim_pos;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Journal trimmed to %08lx (next_free=%08lx dirty_start=%08lx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
|
||||
printf("Journal trimmed to %08jx (next_free=%08jx dirty_start=%08jx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
|
||||
#endif
|
||||
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
||||
{
|
||||
@@ -1419,6 +1419,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
flusher->journal_trim_counter = 0;
|
||||
flusher->trimming = false;
|
||||
}
|
||||
return true;
|
||||
|
@@ -107,7 +107,7 @@ class journal_flusher_t
|
||||
blockstore_impl_t *bs;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
int journal_trim_counter, journal_trim_interval;
|
||||
int journal_trim_counter;
|
||||
bool trimming;
|
||||
void* journal_superblock;
|
||||
|
||||
|
@@ -163,20 +163,10 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
else if (op->opcode == BS_OP_SYNC)
|
||||
{
|
||||
// wait for all small writes to be submitted
|
||||
// wait for all big writes to complete, submit data device fsync
|
||||
// sync only completed writes?
|
||||
// wait for the data device fsync to complete, then submit journal writes for big writes
|
||||
// then submit an fsync operation
|
||||
if (has_writes)
|
||||
{
|
||||
// Can't submit SYNC before previous writes
|
||||
continue;
|
||||
}
|
||||
wr_st = continue_sync(op);
|
||||
if (wr_st != 2)
|
||||
{
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
}
|
||||
else if (op->opcode == BS_OP_STABLE)
|
||||
{
|
||||
@@ -205,6 +195,10 @@ void blockstore_impl_t::loop()
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
{
|
||||
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_idx != new_idx)
|
||||
@@ -275,7 +269,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
{
|
||||
// stop submission if there's still no free space
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
|
||||
printf("Still waiting for %ju SQE(s)\n", PRIV(op)->wait_detail);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
@@ -283,11 +277,12 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
{
|
||||
if (journal.used_start == PRIV(op)->wait_detail)
|
||||
if (journal.used_start == PRIV(op)->wait_detail &&
|
||||
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
|
||||
{
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
|
||||
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
@@ -558,13 +553,14 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc *= 2;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
@@ -642,8 +638,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc += 32768;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
@@ -651,6 +647,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = dirty_it->first;
|
||||
}
|
||||
@@ -666,8 +663,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
if (unstable_count >= unstable_alloc)
|
||||
{
|
||||
unstable_alloc += 32768;
|
||||
unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
|
||||
if (!unstable)
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (stable)
|
||||
free(stable);
|
||||
@@ -675,6 +672,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
unstable = nst;
|
||||
}
|
||||
unstable[unstable_count++] = dirty_it->first;
|
||||
}
|
||||
@@ -694,8 +692,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
if (stable_count+unstable_count > stable_alloc)
|
||||
{
|
||||
stable_alloc = stable_count+unstable_count;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
@@ -703,6 +701,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
// Copy unstable entries
|
||||
for (int i = 0; i < unstable_count; i++)
|
||||
|
@@ -55,6 +55,7 @@
|
||||
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
|
||||
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
|
||||
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
|
||||
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
|
||||
|
||||
#define BS_SUBMIT_CHECK_SQES(n) \
|
||||
if (ringloop->sqes_left() < (n))\
|
||||
@@ -201,7 +202,7 @@ struct blockstore_op_private_t
|
||||
{
|
||||
// Wait status
|
||||
int wait_for;
|
||||
uint64_t wait_detail;
|
||||
uint64_t wait_detail, wait_detail2;
|
||||
int pending_ops;
|
||||
int op_state;
|
||||
|
||||
@@ -252,6 +253,7 @@ class blockstore_impl_t
|
||||
bool inmemory_meta = false;
|
||||
// Maximum and minimum flusher count
|
||||
unsigned max_flusher_count, min_flusher_count;
|
||||
unsigned journal_trim_interval;
|
||||
// Maximum queue depth
|
||||
unsigned max_write_iodepth = 128;
|
||||
// Enable small (journaled) write throttling, useful for the SSD+HDD case
|
||||
@@ -277,6 +279,7 @@ class blockstore_impl_t
|
||||
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
||||
int unsynced_queued_ops = 0;
|
||||
allocator *data_alloc = NULL;
|
||||
uint64_t used_blocks = 0;
|
||||
uint8_t *zero_object;
|
||||
|
||||
void *metadata_buffer = NULL;
|
||||
@@ -376,7 +379,7 @@ class blockstore_impl_t
|
||||
// Stabilize
|
||||
int dequeue_stable(blockstore_op_t *op);
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
||||
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
||||
@@ -430,7 +433,7 @@ public:
|
||||
|
||||
inline uint32_t get_block_size() { return dsk.data_block_size; }
|
||||
inline uint64_t get_block_count() { return dsk.block_count; }
|
||||
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
|
||||
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
|
||||
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
|
||||
inline uint64_t get_journal_size() { return dsk.journal_len; }
|
||||
};
|
||||
|
@@ -63,7 +63,7 @@ int blockstore_init_meta::loop()
|
||||
throw std::runtime_error("Failed to allocate metadata read buffer");
|
||||
// Read superblock
|
||||
GET_SQE();
|
||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
||||
bs->ringloop->submit();
|
||||
@@ -100,7 +100,7 @@ resume_1:
|
||||
{
|
||||
printf("Initializing metadata area\n");
|
||||
GET_SQE();
|
||||
data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->iov = (struct iovec){ metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
|
||||
bs->ringloop->submit();
|
||||
@@ -153,7 +153,7 @@ resume_1:
|
||||
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
printf(
|
||||
"Metadata format is too new for me (stored version is %lu, max supported %u).\n",
|
||||
"Metadata format is too new for me (stored version is %ju, max supported %u).\n",
|
||||
hdr->version, BLOCKSTORE_META_FORMAT_V2
|
||||
);
|
||||
exit(1);
|
||||
@@ -167,7 +167,7 @@ resume_1:
|
||||
printf(
|
||||
"Configuration stored in metadata superblock"
|
||||
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
|
||||
" differs from OSD configuration (%lu/%u/%lu, %u/%u).\n",
|
||||
" differs from OSD configuration (%ju/%u/%ju, %u/%u).\n",
|
||||
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
|
||||
hdr->data_csum_type, hdr->csum_block_size,
|
||||
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
|
||||
@@ -199,7 +199,8 @@ resume_2:
|
||||
submitted++;
|
||||
next_offset += bufs[i].size;
|
||||
GET_SQE();
|
||||
data->iov = { bufs[i].buf, bufs[i].size };
|
||||
assert(bufs[i].size <= 0x7fffffff);
|
||||
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
|
||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||
if (!zero_on_init)
|
||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||
@@ -231,7 +232,8 @@ resume_2:
|
||||
{
|
||||
// write the modified buffer back
|
||||
GET_SQE();
|
||||
data->iov = { bufs[i].buf, bufs[i].size };
|
||||
assert(bufs[i].size <= 0x7fffffff);
|
||||
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
|
||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||
bufs[i].state = INIT_META_WRITING;
|
||||
@@ -257,7 +259,7 @@ resume_2:
|
||||
next_offset = entries_to_zero[i]/entries_per_block;
|
||||
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
|
||||
GET_SQE();
|
||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||
submitted++;
|
||||
@@ -273,7 +275,7 @@ resume_5:
|
||||
memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||
}
|
||||
GET_SQE();
|
||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||
submitted++;
|
||||
@@ -287,7 +289,7 @@ resume_6:
|
||||
entries_to_zero.clear();
|
||||
}
|
||||
// metadata read finished
|
||||
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
||||
printf("Metadata entries loaded: %ju, free blocks: %ju / %ju\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
||||
if (!bs->inmemory_meta)
|
||||
{
|
||||
free(metadata_buffer);
|
||||
@@ -328,7 +330,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
|
||||
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
|
||||
{
|
||||
printf("Metadata entry %lu is corrupt (checksum mismatch), skipping\n", done_cnt+i);
|
||||
printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -366,7 +368,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
|
||||
}
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
printf("Free block %ju from %jx:%jx v%ju (new location is %ju)\n",
|
||||
old_clean_loc,
|
||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||
done_cnt+i);
|
||||
@@ -376,10 +378,11 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||
else
|
||||
{
|
||||
bs->inode_space_stats[entry->oid.inode] += bs->dsk.data_block_size;
|
||||
bs->used_blocks++;
|
||||
}
|
||||
entries_loaded++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
printf("Allocate block (clean entry) %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
#endif
|
||||
bs->data_alloc->set(done_cnt+i, true);
|
||||
clean_db[entry->oid] = (struct clean_entry){
|
||||
@@ -393,7 +396,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
|
||||
updated = true;
|
||||
memset(entry, 0, bs->dsk.clean_entry_size);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Old clean entry %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
printf("Old clean entry %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -465,7 +468,7 @@ int blockstore_init_journal::loop()
|
||||
if (!sqe)
|
||||
throw std::runtime_error("io_uring is full while trying to read journal");
|
||||
data = ((ring_data_t*)sqe->user_data);
|
||||
data->iov = { submitted_buf, bs->journal.block_size };
|
||||
data->iov = { submitted_buf, (size_t)bs->journal.block_size };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||
bs->ringloop->submit();
|
||||
@@ -506,7 +509,7 @@ resume_1:
|
||||
// FIXME: Randomize initial crc32. Track crc32 when trimming.
|
||||
printf("Resetting journal\n");
|
||||
GET_SQE();
|
||||
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
|
||||
data->iov = (struct iovec){ submitted_buf, (size_t)(2*bs->journal.block_size) };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
|
||||
wait_count++;
|
||||
@@ -556,7 +559,7 @@ resume_1:
|
||||
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
|
||||
stderr, "The code only supports journal versions 2 and 1, but it is %ju on disk."
|
||||
" Please use vitastor-disk to rewrite the journal\n",
|
||||
je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
|
||||
);
|
||||
@@ -605,7 +608,7 @@ resume_1:
|
||||
submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos;
|
||||
data->iov = {
|
||||
submitted_buf,
|
||||
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
||||
(size_t)(end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE),
|
||||
};
|
||||
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
||||
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
||||
@@ -621,7 +624,7 @@ resume_1:
|
||||
if (init_write_buf && !bs->readonly)
|
||||
{
|
||||
GET_SQE();
|
||||
data->iov = { init_write_buf, bs->journal.block_size };
|
||||
data->iov = { init_write_buf, (size_t)bs->journal.block_size };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
|
||||
wait_count++;
|
||||
@@ -690,7 +693,7 @@ resume_1:
|
||||
IS_BIG_WRITE(dirty_it->second.state) &&
|
||||
dirty_it->second.location == UINT64_MAX)
|
||||
{
|
||||
printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
|
||||
printf("Fatal error (bug): %jx:%jx v%ju big_write journal_entry was allocated over another object\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
exit(1);
|
||||
}
|
||||
@@ -698,7 +701,7 @@ resume_1:
|
||||
bs->flusher->mark_trim_possible();
|
||||
bs->journal.dirty_start = bs->journal.next_free;
|
||||
printf(
|
||||
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
|
||||
"Journal entries loaded: %ju, free journal space: %ju bytes (%08jx..%08jx is used), free blocks: %ju / %ju\n",
|
||||
entries_loaded,
|
||||
(bs->journal.next_free >= bs->journal.used_start
|
||||
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
|
||||
@@ -753,7 +756,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u\n",
|
||||
"je_small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u\n",
|
||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||
je->small_write.offset, je->small_write.len
|
||||
@@ -775,7 +778,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
if (location != je->small_write.data_offset)
|
||||
{
|
||||
char err[1024];
|
||||
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
|
||||
snprintf(err, 1024, "BUG: calculated journal data offset (%08jx) != stored journal data offset (%08jx)", location, je->small_write.data_offset);
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
small_write_data.clear();
|
||||
@@ -802,7 +805,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
covered += part_end - part_begin;
|
||||
small_write_data.push_back((iovec){
|
||||
.iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
|
||||
.iov_len = part_end - part_begin,
|
||||
.iov_len = (size_t)(part_end - part_begin),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -825,7 +828,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
if (!data_csum_valid)
|
||||
{
|
||||
printf(
|
||||
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - data crc32 %x != %x\n",
|
||||
"Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - data crc32 %x != %x\n",
|
||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||
je->small_write.offset, je->small_write.len,
|
||||
@@ -844,7 +847,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
if (je->size != required_size)
|
||||
{
|
||||
printf(
|
||||
"Journal entry data has invalid size for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - should be %u bytes but is %u bytes\n",
|
||||
"Journal entry data has invalid size for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - should be %u bytes but is %u bytes\n",
|
||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||
je->small_write.offset, je->small_write.len,
|
||||
@@ -892,7 +895,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
if (block_crc32 != *block_csums)
|
||||
{
|
||||
printf(
|
||||
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - block %u crc32 %x != %x\n",
|
||||
"Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - block %u crc32 %x != %x\n",
|
||||
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
|
||||
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
|
||||
je->small_write.offset, je->small_write.len,
|
||||
@@ -955,7 +958,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
bs->journal.used_sectors[proc_pos]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
||||
);
|
||||
#endif
|
||||
@@ -971,7 +974,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
|
||||
"je_big_write%s oid=%jx:%jx ver=%ju loc=%ju\n",
|
||||
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
||||
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
|
||||
);
|
||||
@@ -1048,7 +1051,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
|
||||
"Allocate block (journal) %ju: %jx:%jx v%ju\n",
|
||||
je->big_write.location >> bs->dsk.block_order,
|
||||
ov.oid.inode, ov.oid.stripe, ov.version
|
||||
);
|
||||
@@ -1058,7 +1061,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
bs->journal.used_sectors[proc_pos]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
|
||||
);
|
||||
#endif
|
||||
@@ -1073,7 +1076,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
else if (je->type == JE_STABLE)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
|
||||
printf("je_stable oid=%jx:%jx ver=%ju\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
|
||||
#endif
|
||||
// oid, version
|
||||
obj_ver_id ov = {
|
||||
@@ -1085,7 +1088,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
else if (je->type == JE_ROLLBACK)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
|
||||
printf("je_rollback oid=%jx:%jx ver=%ju\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
|
||||
#endif
|
||||
// rollback dirty writes of <oid> up to <version>
|
||||
obj_ver_id ov = {
|
||||
@@ -1097,7 +1100,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
else if (je->type == JE_DELETE)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||
printf("je_delete oid=%jx:%jx ver=%ju\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||
#endif
|
||||
bool dirty_exists = false;
|
||||
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||
@@ -1181,6 +1184,7 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
|
||||
sp -= bs->dsk.data_block_size;
|
||||
else
|
||||
bs->inode_space_stats.erase(oid.inode);
|
||||
bs->used_blocks--;
|
||||
}
|
||||
bs->erase_dirty(dirty_it, dirty_end, clean_loc);
|
||||
// Remove it from the flusher's queue, too
|
||||
|
@@ -90,8 +90,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||
}
|
||||
// In fact, it's even more rare than "ran out of journal space", so print a warning
|
||||
printf(
|
||||
"Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
|
||||
" is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
|
||||
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
|
||||
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
|
||||
used, bs->journal.sector_count, dirty, next_sector,
|
||||
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
|
||||
bs->journal.sector_info[next_sector].flush_count
|
||||
@@ -103,7 +103,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||
if (data_after > 0)
|
||||
{
|
||||
next_pos = next_pos + data_after;
|
||||
if (next_pos > bs->journal.len)
|
||||
if (next_pos >= bs->journal.len)
|
||||
{
|
||||
if (right_dir)
|
||||
next_pos = bs->journal.block_size + data_after;
|
||||
@@ -114,7 +114,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
||||
{
|
||||
// No space in the journal. Wait until used_start changes.
|
||||
printf(
|
||||
"Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
|
||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||
);
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||
@@ -146,7 +146,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
|
||||
journal.in_sector_pos = 0;
|
||||
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
||||
// double check that next_free doesn't cross used_start from the left
|
||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||
journal.next_free = next_next_free;
|
||||
memset(journal.inmemory
|
||||
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
||||
@@ -183,7 +183,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
|
||||
(journal.inmemory
|
||||
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
|
||||
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
|
||||
journal.block_size
|
||||
(size_t)journal.block_size
|
||||
};
|
||||
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
|
||||
my_uring_prep_writev(
|
||||
@@ -263,7 +263,7 @@ uint64_t journal_t::get_trim_pos()
|
||||
// next_free does not need updating during trim
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
@@ -276,7 +276,7 @@ uint64_t journal_t::get_trim_pos()
|
||||
// Journal is cleared up to <journal_used_it>
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
@@ -296,7 +296,7 @@ void journal_t::dump_diagnostics()
|
||||
journal_used_it = used_sectors.begin();
|
||||
}
|
||||
printf(
|
||||
"Journal: used_start=%08lx next_free=%08lx dirty_start=%08lx trim_to=%08lx trim_to_refs=%ld\n",
|
||||
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
||||
|
@@ -13,13 +13,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
}
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||
if (config.find("autosync_writes") != config.end())
|
||||
if (config["autosync_writes"] != "")
|
||||
{
|
||||
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
|
||||
}
|
||||
@@ -31,6 +32,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!journal_trim_interval)
|
||||
{
|
||||
journal_trim_interval = 512;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
max_write_iodepth = 128;
|
||||
|
@@ -25,7 +25,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
|
||||
return 1;
|
||||
}
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
data->iov = (struct iovec){ buf, len };
|
||||
data->iov = (struct iovec){ buf, (size_t)len };
|
||||
PRIV(op)->pending_ops++;
|
||||
my_uring_prep_readv(
|
||||
sqe,
|
||||
@@ -505,7 +505,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
for (auto & rv: PRIV(read_op)->read_vec)
|
||||
{
|
||||
if (rv.journal_sector)
|
||||
journal.used_sectors[rv.journal_sector-1]++;
|
||||
journal.used_sectors.at(rv.journal_sector-1)++;
|
||||
}
|
||||
}
|
||||
read_op->retval = 0;
|
||||
@@ -700,7 +700,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
|
||||
.buf = buf,
|
||||
});
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
data->iov = (struct iovec){ buf, dsk.meta_block_size };
|
||||
data->iov = (struct iovec){ buf, (size_t)dsk.meta_block_size };
|
||||
PRIV(op)->pending_ops++;
|
||||
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
|
||||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||
@@ -855,7 +855,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||
{
|
||||
ok = false;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
||||
"Checksum mismatch in object %jx:%jx v%ju in journal at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
);
|
||||
@@ -875,7 +875,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||
{
|
||||
ok = false;
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
|
||||
"Checksum mismatch in object %jx:%jx v%ju in %s data at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
(rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
|
||||
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
|
||||
@@ -918,7 +918,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||
{
|
||||
// checksum error
|
||||
printf(
|
||||
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
|
||||
"Checksum mismatch in object %jx:%jx v%ju in %s area at offset 0x%jx+0x%zx: %08x vs %08x\n",
|
||||
op->oid.inode, op->oid.stripe, op->version,
|
||||
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
|
||||
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
|
||||
@@ -966,7 +966,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||
{
|
||||
if (rv.journal_sector)
|
||||
{
|
||||
auto used = --journal.used_sectors[rv.journal_sector-1];
|
||||
auto used = --journal.used_sectors.at(rv.journal_sector-1);
|
||||
if (used == 0)
|
||||
{
|
||||
journal.used_sectors.erase(rv.journal_sector-1);
|
||||
|
@@ -179,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
{
|
||||
object_id oid = dirty_it->first.oid;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
dirty_it = dirty_end;
|
||||
// Unblock operations blocked by delete flushing
|
||||
@@ -210,21 +210,26 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
dirty_it->second.location != UINT64_MAX)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> dsk.block_order,
|
||||
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
|
||||
}
|
||||
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
|
||||
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
|
||||
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
|
||||
);
|
||||
#endif
|
||||
if (used == 0)
|
||||
{
|
||||
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
|
||||
{
|
||||
// Mark current sector as "full" to select the new one
|
||||
journal.in_sector_pos = dsk.journal_block_size;
|
||||
}
|
||||
flusher->mark_trim_possible();
|
||||
}
|
||||
free_dirty_dyn_data(dirty_it->second);
|
||||
|
@@ -298,7 +298,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||
{
|
||||
// No such object version
|
||||
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
return -ENOENT;
|
||||
}
|
||||
else
|
||||
@@ -307,35 +307,49 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
}
|
||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
// Object not synced yet - sync it
|
||||
// In previous versions we returned EBUSY here and required
|
||||
// the caller (OSD) to issue a global sync first. But a global sync
|
||||
// waits for all writes in the queue including inflight writes. And
|
||||
// inflight writes may themselves be blocked by unstable writes being
|
||||
// still present in the journal and not flushed away from it.
|
||||
// So we must sync specific objects here.
|
||||
//
|
||||
// Even more, we have to process "stabilize" request in parts. That is,
|
||||
// we must stabilize all objects which are already synced. Otherwise
|
||||
// they may block objects which are NOT synced yet.
|
||||
return STAB_SPLIT_SYNC;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
else
|
||||
while (true)
|
||||
{
|
||||
return STAB_SPLIT_TODO;
|
||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
// Object not synced yet - sync it
|
||||
// In previous versions we returned EBUSY here and required
|
||||
// the caller (OSD) to issue a global sync first. But a global sync
|
||||
// waits for all writes in the queue including inflight writes. And
|
||||
// inflight writes may themselves be blocked by unstable writes being
|
||||
// still present in the journal and not flushed away from it.
|
||||
// So we must sync specific objects here.
|
||||
//
|
||||
// Even more, we have to process "stabilize" request in parts. That is,
|
||||
// we must stabilize all objects which are already synced. Otherwise
|
||||
// they may block objects which are NOT synced yet.
|
||||
return STAB_SPLIT_SYNC;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
break;
|
||||
}
|
||||
// Check previous versions too
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != ov.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return STAB_SPLIT_TODO;
|
||||
});
|
||||
if (r != 1)
|
||||
{
|
||||
@@ -402,7 +416,7 @@ resume_4:
|
||||
{
|
||||
// Mark all dirty_db entries up to op->version as stable
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
#endif
|
||||
mark_stable(*v);
|
||||
}
|
||||
@@ -412,11 +426,40 @@ resume_4:
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(v);
|
||||
if (dirty_it != dirty_db.end())
|
||||
{
|
||||
if (IS_INSTANT(dirty_it->second.state))
|
||||
{
|
||||
// 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it.
|
||||
auto back_it = dirty_it;
|
||||
while (back_it != dirty_db.begin())
|
||||
{
|
||||
back_it--;
|
||||
if (back_it->first.oid != v.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (!IS_STABLE(back_it->second.state))
|
||||
{
|
||||
// There are preceding unstable versions, can't flush <v>
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
dirty_it++;
|
||||
if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid ||
|
||||
!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
dirty_it--;
|
||||
break;
|
||||
}
|
||||
v.version = dirty_it->first.version;
|
||||
}
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
bool was_stable = IS_STABLE(dirty_it->second.state);
|
||||
@@ -445,6 +488,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
if (!exists)
|
||||
{
|
||||
inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
|
||||
used_blocks++;
|
||||
}
|
||||
big_to_flush++;
|
||||
}
|
||||
@@ -455,6 +499,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(dirty_it->first.oid.inode);
|
||||
used_blocks--;
|
||||
big_to_flush++;
|
||||
}
|
||||
}
|
||||
@@ -462,7 +507,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
{
|
||||
// mark_stable should never be called for in-flight or submitted writes
|
||||
printf(
|
||||
"BUG: Attempt to mark_stable object %lx:%lx v%lu state of which is %x\n",
|
||||
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
dirty_it->second.state
|
||||
);
|
||||
|
@@ -85,16 +85,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
left--;
|
||||
auto & dirty_entry = dirty_db.at(sbw);
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -117,11 +115,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write) + dyn_size
|
||||
);
|
||||
dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
assert(journal.next_free >= journal.used_start
|
||||
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||
: (jsec >= journal.used_start || jsec < journal.next_free));
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
@@ -175,7 +176,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync big %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
@@ -203,7 +204,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync small %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
|
@@ -85,7 +85,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
// It's allowed to write versions with low numbers over deletes
|
||||
// However, we have to flush those deletes first as we use version number for ordering
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
|
||||
printf("Write %jx:%jx v%ju over delete (real v%ju) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
|
||||
#endif
|
||||
wait_del = true;
|
||||
PRIV(op)->real_version = op->version;
|
||||
@@ -95,11 +95,13 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
// Issue an additional sync so the delete reaches the journal
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
sync_op->opcode = BS_OP_SYNC;
|
||||
sync_op->callback = [this, op](blockstore_op_t *sync_op)
|
||||
sync_op->oid = op->oid;
|
||||
sync_op->version = op->version;
|
||||
sync_op->callback = [this](blockstore_op_t *sync_op)
|
||||
{
|
||||
flusher->unshift_flush((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version-1,
|
||||
.oid = sync_op->oid,
|
||||
.version = sync_op->version-1,
|
||||
}, true);
|
||||
delete sync_op;
|
||||
};
|
||||
@@ -117,7 +119,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
{
|
||||
// Invalid version requested
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
|
||||
printf("Write %jx:%jx v%ju requested, but we already have v%ju\n", op->oid.inode, op->oid.stripe, op->version, version);
|
||||
#endif
|
||||
op->retval = -EEXIST;
|
||||
if (!is_del && alloc_dyn_data)
|
||||
@@ -129,7 +131,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
}
|
||||
bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
|
||||
if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
|
||||
!imm && unsynced_queued_ops >= autosync_writes)
|
||||
!imm && autosync_writes && unsynced_queued_ops >= autosync_writes)
|
||||
{
|
||||
// Issue an additional sync so that the previous big write can reach the journal
|
||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||
@@ -144,9 +146,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
unsynced_queued_ops++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
if (is_del)
|
||||
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
|
||||
printf("Delete %jx:%jx v%ju\n", op->oid.inode, op->oid.stripe, op->version);
|
||||
else if (!wait_del)
|
||||
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
||||
printf("Write %jx:%jx v%ju offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
||||
#endif
|
||||
// No strict need to add it into dirty_db here except maybe for listings to return
|
||||
// correct data when there are inflight operations in the queue
|
||||
@@ -286,7 +288,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
}
|
||||
// Restore original low version number for unblocked operations
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
|
||||
printf("Restoring %jx:%jx version: v%ju -> v%ju\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
|
||||
#endif
|
||||
auto prev_it = dirty_it;
|
||||
if (prev_it != dirty_db.begin())
|
||||
@@ -296,7 +298,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
{
|
||||
// Original version is still invalid
|
||||
// All subsequent writes to the same object must be canceled too
|
||||
printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
|
||||
printf("Tried to write %jx:%jx v%ju after delete (old version v%ju), but already have v%ju\n",
|
||||
op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
|
||||
cancel_all_writes(op, dirty_it, -EEXIST);
|
||||
return 2;
|
||||
@@ -320,7 +322,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -348,8 +350,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
if (entry->oid.inode || entry->oid.stripe || entry->version)
|
||||
{
|
||||
printf(
|
||||
"Fatal error (metadata corruption or bug): tried to write object %lx:%lx v%lu"
|
||||
" over a non-zero metadata entry %lu with %lx:%lx v%lu\n", op->oid.inode,
|
||||
"Fatal error (metadata corruption or bug): tried to write object %jx:%jx v%ju"
|
||||
" over a non-zero metadata entry %ju with %jx:%jx v%ju\n", op->oid.inode,
|
||||
op->oid.stripe, op->version, loc, entry->oid.inode, entry->oid.stripe, entry->version
|
||||
);
|
||||
exit(1);
|
||||
@@ -361,7 +363,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Allocate block %lu for %lx:%lx v%lu\n",
|
||||
"Allocate block %ju for %jx:%jx v%ju\n",
|
||||
loc, op->oid.inode, op->oid.stripe, op->version
|
||||
);
|
||||
#endif
|
||||
@@ -372,13 +374,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
int vcnt = 0;
|
||||
if (stripe_offset)
|
||||
{
|
||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
|
||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_offset };
|
||||
}
|
||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
|
||||
if (stripe_end)
|
||||
{
|
||||
stripe_end = dsk.bitmap_granularity - stripe_end;
|
||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
|
||||
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_end };
|
||||
}
|
||||
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
@@ -412,7 +414,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|
||||
|| !space_check.check_available(op, 1,
|
||||
sizeof(journal_entry_small_write) + dyn_size,
|
||||
op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||
op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -436,11 +438,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
|
||||
sizeof(journal_entry_small_write) + dyn_size
|
||||
);
|
||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
if (!(journal.next_free >= journal.used_start
|
||||
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||
: (jsec >= journal.used_start || jsec < journal.next_free)))
|
||||
{
|
||||
printf(
|
||||
"BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
|
||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
|
||||
journal.used_start, journal.next_free
|
||||
);
|
||||
abort();
|
||||
}
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
@@ -454,8 +468,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
journal_used_it->first < next_next_free + op->len)
|
||||
{
|
||||
printf(
|
||||
"BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!"
|
||||
" Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n",
|
||||
"BUG: Attempt to overwrite used offset (%jx, %ju refs) of the journal with the object %jx:%jx v%ju: data at %jx, len %x!"
|
||||
" Journal used_start=%08jx (%ju refs), next_free=%08jx, dirty_start=%08jx\n",
|
||||
journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
|
||||
journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
|
||||
);
|
||||
@@ -463,7 +477,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
}
|
||||
}
|
||||
// double check that next_free doesn't cross used_start from the left
|
||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||
journal.next_free = next_next_free;
|
||||
je->oid = op->oid;
|
||||
je->version = op->version;
|
||||
@@ -505,7 +519,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
if (next_next_free >= journal.len)
|
||||
next_next_free = dsk.journal_block_size;
|
||||
// double check that next_free doesn't cross used_start from the left
|
||||
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
|
||||
journal.next_free = next_next_free;
|
||||
if (!(dirty_it->second.state & BS_ST_INSTANT))
|
||||
{
|
||||
@@ -549,7 +563,7 @@ resume_2:
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -558,11 +572,23 @@ resume_2:
|
||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write) + dyn_size
|
||||
);
|
||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
if (!(journal.next_free >= journal.used_start
|
||||
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||
: (jsec >= journal.used_start || jsec < journal.next_free)))
|
||||
{
|
||||
printf(
|
||||
"BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
|
||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
|
||||
journal.used_start, journal.next_free
|
||||
);
|
||||
abort();
|
||||
}
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
@@ -589,11 +615,11 @@ resume_4:
|
||||
});
|
||||
assert(dirty_it != dirty_db.end());
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
printf("Ack write %jx:%jx v%ju = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
#endif
|
||||
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
|
||||
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
|
||||
bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
|
||||
bool is_instant = IS_INSTANT(dirty_it->second.state);
|
||||
if (imm)
|
||||
{
|
||||
auto & unstab = unstable_writes[op->oid];
|
||||
@@ -782,7 +808,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
|
10
src/cli.cpp
10
src/cli.cpp
@@ -46,18 +46,21 @@ static const char* help_text =
|
||||
"vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>\n"
|
||||
" Create a snapshot of image <name>. May be used live if only a single writer is active.\n"
|
||||
"\n"
|
||||
"vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]\n"
|
||||
"vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]\n"
|
||||
" Rename, resize image or change its readonly status. Images with children can't be made read-write.\n"
|
||||
" If the new size is smaller than the old size, extra data will be purged.\n"
|
||||
" You should resize file system in the image, if present, before shrinking it.\n"
|
||||
" -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n"
|
||||
" --down-ok Proceed with shrinking even if some data will be left on unavailable OSDs.\n"
|
||||
"\n"
|
||||
"vitastor-cli rm <from> [<to>] [--writers-stopped]\n"
|
||||
"vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]\n"
|
||||
" Remove <from> or all layers between <from> and <to> (<to> must be a child of <from>),\n"
|
||||
" rebasing all their children accordingly. --writers-stopped allows merging to be a bit\n"
|
||||
" more effective in case of a single 'slim' read-write child and 'fat' removed parent:\n"
|
||||
" the child is merged into parent and parent is renamed to child in that case.\n"
|
||||
" In other cases parent layers are always merged into children.\n"
|
||||
" Other options:\n"
|
||||
" --down-ok Continue deletion/merging even if some data will be left on unavailable OSDs.\n"
|
||||
"\n"
|
||||
"vitastor-cli flatten <layer>\n"
|
||||
" Flatten a layer, i.e. merge data and detach it from parents.\n"
|
||||
@@ -122,7 +125,7 @@ static const char* help_text =
|
||||
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
|
||||
" --progress 1|0 Report progress (default 1)\n"
|
||||
" --cas 1|0 Use CAS writes for flatten, merge, rm (default is decide automatically)\n"
|
||||
" --no-color Disable colored output\n"
|
||||
" --color 1|0 Enable/disable colored output and CR symbols (default 1 if stdout is a terminal)\n"
|
||||
" --json JSON output\n"
|
||||
;
|
||||
|
||||
@@ -171,6 +174,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
|
||||
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
|
||||
!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
|
||||
!strcmp(opt, "down-ok") || !strcmp(opt, "down_ok") ||
|
||||
!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
|
||||
!strcmp(opt, "help") || !strcmp(opt, "all") ||
|
||||
(!strcmp(opt, "writers-stopped") || !strcmp(opt, "writers_stopped")) && strcmp("1", args[i+1]) != 0
|
||||
|
@@ -77,7 +77,7 @@ struct alloc_osd_t
|
||||
std::string key = base64_decode(kv["key"].string_value());
|
||||
osd_num_t cur_osd;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%lu%c", &cur_osd, &null_byte);
|
||||
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%ju%c", &cur_osd, &null_byte);
|
||||
if (scanned != 1 || !cur_osd)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
|
||||
|
@@ -1,6 +1,7 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include "str_util.h"
|
||||
#include "cluster_client.h"
|
||||
#include "cli.h"
|
||||
@@ -11,7 +12,7 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *re
|
||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[128];
|
||||
snprintf(buf, 128, "Inode 0x%lx disappeared", cur);
|
||||
snprintf(buf, 128, "Inode 0x%jx disappeared", cur);
|
||||
*result = (cli_result_t){ .err = EIO, .text = buf };
|
||||
return;
|
||||
}
|
||||
@@ -113,7 +114,12 @@ void cli_tool_t::parse_config(json11::Json::object & cfg)
|
||||
else
|
||||
kv_it++;
|
||||
}
|
||||
color = !cfg["no_color"].bool_value();
|
||||
if (cfg.find("no_color") != cfg.end())
|
||||
color = !cfg["no_color"].bool_value();
|
||||
else if (cfg.find("color") != cfg.end())
|
||||
color = cfg["color"].bool_value();
|
||||
else
|
||||
color = isatty(1);
|
||||
json_output = cfg["json"].bool_value();
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
|
@@ -160,14 +160,14 @@ struct cli_describe_t
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
|
||||
stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n",
|
||||
osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
|
||||
stderr, "Invalid response size from OSD %ju (expected %ju bytes, got %ju bytes)\n",
|
||||
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
|
||||
);
|
||||
}
|
||||
@@ -178,11 +178,11 @@ struct cli_describe_t
|
||||
{
|
||||
if (!parent->json_output || parent->is_command_line)
|
||||
{
|
||||
#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
|
||||
#define FMT "{\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"part\":%u,\"osd_num\":%ju%s%s%s}"
|
||||
printf(
|
||||
(parent->json_output
|
||||
? (count > 0 ? ",\n " FMT : " " FMT)
|
||||
: "%lx:%lx part %u on OSD %lu%s%s%s\n"),
|
||||
: "%jx:%jx part %u on OSD %ju%s%s%s\n"),
|
||||
#undef FMT
|
||||
items[i].inode, items[i].stripe,
|
||||
items[i].role, items[i].osd_num,
|
||||
|
@@ -82,7 +82,7 @@ resume_1:
|
||||
// osd ID
|
||||
osd_num_t osd_num;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
|
||||
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
|
@@ -136,7 +136,7 @@ struct cli_fix_t
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
|
||||
fprintf(stderr, "Object %jx:%jx is from unknown pool\n", obj.inode, obj.stripe);
|
||||
continue;
|
||||
}
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
@@ -146,7 +146,7 @@ struct cli_fix_t
|
||||
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
|
||||
stderr, "Object %jx:%jx is from PG %u/%u which is not currently active\n",
|
||||
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
|
||||
);
|
||||
continue;
|
||||
@@ -171,7 +171,7 @@ struct cli_fix_t
|
||||
{
|
||||
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
|
||||
{
|
||||
fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
|
||||
fprintf(stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n", primary_osd, op->reply.hdr.retval);
|
||||
parent->waiting--;
|
||||
loop();
|
||||
}
|
||||
@@ -209,7 +209,7 @@ struct cli_fix_t
|
||||
if (rm_op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
|
||||
stderr, "Failed to remove object %jx:%jx from OSD %ju (retval=%jd)\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
|
||||
rm_osd_num, rm_op->reply.hdr.retval
|
||||
);
|
||||
@@ -226,7 +226,7 @@ struct cli_fix_t
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Removed %lx:%lx (part %lu) from OSD %lu\n",
|
||||
"Removed %jx:%jx (part %ju) from OSD %ju\n",
|
||||
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
|
||||
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
|
||||
);
|
||||
@@ -254,7 +254,7 @@ struct cli_fix_t
|
||||
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
|
||||
stderr, "Failed to scrub %jx:%jx on OSD %ju (retval=%jd)\n",
|
||||
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
|
@@ -150,7 +150,7 @@ resume_1:
|
||||
inode_t only_inode_num;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
|
||||
"/inode/stats/%u/%lu%c", &pool_id, &only_inode_num, &null_byte);
|
||||
"/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
|
||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
@@ -456,7 +456,7 @@ std::string format_lat(uint64_t lat)
|
||||
char buf[256];
|
||||
int l = 0;
|
||||
if (lat < 100)
|
||||
l = snprintf(buf, sizeof(buf), "%lu us", lat);
|
||||
l = snprintf(buf, sizeof(buf), "%ju us", lat);
|
||||
else if (lat < 500000)
|
||||
l = snprintf(buf, sizeof(buf), "%.2f ms", (double)lat/1000);
|
||||
else
|
||||
|
@@ -202,7 +202,7 @@ struct snap_merger_t
|
||||
if (parent->progress)
|
||||
{
|
||||
printf(
|
||||
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
|
||||
"Merging %zd layer(s) into target %s%s (inode %ju in pool %u)\n",
|
||||
sources.size(), target_cfg->name.c_str(),
|
||||
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
||||
);
|
||||
@@ -275,7 +275,9 @@ struct snap_merger_t
|
||||
processed++;
|
||||
if (parent->progress && !(processed % 128))
|
||||
{
|
||||
printf("\rFiltering target blocks: %lu/%lu", processed, to_process);
|
||||
fprintf(stderr, parent->color
|
||||
? "\rFiltering target blocks: %ju/%ju"
|
||||
: "Filtering target blocks: %ju/%ju\n", processed, to_process);
|
||||
}
|
||||
}
|
||||
if (in_flight > 0 || oit != merge_offsets.end())
|
||||
@@ -285,7 +287,9 @@ struct snap_merger_t
|
||||
}
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("\r%lu full blocks of target filtered out\n", to_process-merge_offsets.size());
|
||||
fprintf(stderr, parent->color
|
||||
? "\r%ju full blocks of target filtered out\n"
|
||||
: "%ju full blocks of target filtered out\n", to_process-merge_offsets.size());
|
||||
}
|
||||
}
|
||||
state = 3;
|
||||
@@ -320,7 +324,9 @@ struct snap_merger_t
|
||||
processed++;
|
||||
if (parent->progress && !(processed % 128))
|
||||
{
|
||||
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
|
||||
fprintf(stderr, parent->color
|
||||
? "\rOverwriting blocks: %ju/%ju"
|
||||
: "Overwriting blocks: %ju/%ju\n", processed, to_process);
|
||||
}
|
||||
}
|
||||
if (in_flight == 0 && rwo_error.size())
|
||||
@@ -339,7 +345,9 @@ struct snap_merger_t
|
||||
}
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
|
||||
fprintf(stderr, parent->color
|
||||
? "\rOverwriting blocks: %ju/%ju\n"
|
||||
: "Overwriting blocks: %ju/%ju\n", to_process, to_process);
|
||||
}
|
||||
// Done
|
||||
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
|
||||
@@ -384,7 +392,7 @@ struct snap_merger_t
|
||||
auto & name = parent->cli->st_cli.inode_config.at(src).name;
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||
printf("Got listing of layer %s (inode %ju in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||
}
|
||||
if (delete_source)
|
||||
{
|
||||
@@ -416,7 +424,7 @@ struct snap_merger_t
|
||||
{
|
||||
if (op->retval < 0)
|
||||
{
|
||||
fprintf(stderr, "error reading target bitmap at offset %lx: %s\n", op->offset, strerror(-op->retval));
|
||||
fprintf(stderr, "error reading target bitmap at offset %jx: %s\n", op->offset, strerror(-op->retval));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -571,7 +579,7 @@ struct snap_merger_t
|
||||
{
|
||||
if (subop->retval != 0)
|
||||
{
|
||||
fprintf(stderr, "error deleting from layer 0x%lx at offset %lx: %s", subop->inode, subop->offset, strerror(-subop->retval));
|
||||
fprintf(stderr, "error deleting from layer 0x%jx at offset %jx: %s", subop->inode, subop->offset, strerror(-subop->retval));
|
||||
}
|
||||
delete subop;
|
||||
};
|
||||
@@ -620,7 +628,7 @@ struct snap_merger_t
|
||||
if (rwo->error_code)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Error %s target at offset %lx: %s",
|
||||
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
|
||||
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
|
||||
rwo_error = std::string(buf);
|
||||
}
|
||||
|
@@ -15,6 +15,7 @@ struct image_changer_t
|
||||
uint64_t new_size = 0;
|
||||
bool force_size = false, inc_size = false;
|
||||
bool set_readonly = false, set_readwrite = false, force = false;
|
||||
bool down_ok = false;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
@@ -105,6 +106,7 @@ struct image_changer_t
|
||||
{ "pool", (uint64_t)INODE_POOL(inode_num) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
{ "min-offset", ((new_size+4095)/4096)*4096 },
|
||||
{ "down-ok", down_ok },
|
||||
});
|
||||
resume_1:
|
||||
while (!cb(result))
|
||||
@@ -240,6 +242,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
changer->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
if (!changer->fsync_interval)
|
||||
changer->fsync_interval = 128;
|
||||
changer->down_ok = cfg["down_ok"].bool_value();
|
||||
// FIXME Check that the image doesn't have children when shrinking
|
||||
return [changer](cli_result_t & result)
|
||||
{
|
||||
|
@@ -53,6 +53,8 @@ struct snap_remover_t
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
// ignore deletion errors
|
||||
bool down_ok = false;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
@@ -291,7 +293,7 @@ resume_100:
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
|
||||
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%jx) not found", cur->name.c_str(), cur->parent_id);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
@@ -384,7 +386,7 @@ resume_100:
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode, &null_byte);
|
||||
if (scanned != 2 || !inode)
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
|
||||
@@ -439,7 +441,7 @@ resume_100:
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_child);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -448,7 +450,7 @@ resume_100:
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_parent);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -576,7 +578,7 @@ resume_100:
|
||||
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", cur);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -640,7 +642,7 @@ resume_100:
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", child_inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -649,7 +651,7 @@ resume_100:
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", target_inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -670,7 +672,7 @@ resume_100:
|
||||
if (source == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
|
||||
snprintf(buf, 1024, "Inode 0x%jx disappeared", inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
@@ -679,6 +681,7 @@ resume_100:
|
||||
{ "inode", inode },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
{ "down-ok", down_ok },
|
||||
});
|
||||
}
|
||||
};
|
||||
@@ -690,6 +693,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
snap_remover->from_name = cfg["from"].string_value();
|
||||
snap_remover->to_name = cfg["to"].string_value();
|
||||
snap_remover->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
snap_remover->down_ok = cfg["down_ok"].bool_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
|
@@ -17,6 +17,7 @@ struct rm_pg_t
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
bool synced = false;
|
||||
};
|
||||
|
||||
struct rm_inode_t
|
||||
@@ -24,6 +25,7 @@ struct rm_inode_t
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
bool down_ok = false;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
@@ -48,6 +50,7 @@ struct rm_inode_t
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
.synced = parent->cli->get_immediate_commit(inode),
|
||||
});
|
||||
if (min_offset == 0)
|
||||
{
|
||||
@@ -93,7 +96,7 @@ struct rm_inode_t
|
||||
fprintf(stderr, "Some data may remain after delete on OSDs which are currently down: ");
|
||||
for (int i = 0; i < inactive_osds.size(); i++)
|
||||
{
|
||||
fprintf(stderr, i > 0 ? ", %lu" : "%lu", inactive_osds[i]);
|
||||
fprintf(stderr, i > 0 ? ", %ju" : "%ju", inactive_osds[i]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
@@ -136,7 +139,7 @@ struct rm_inode_t
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
fprintf(stderr, "Failed to remove object %jx:%jx from PG %u (OSD %ju) (retval=%jd)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
error_count++;
|
||||
@@ -151,6 +154,37 @@ struct rm_inode_t
|
||||
}
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
if (cur_list->in_flight == 0 && cur_list->obj_pos == cur_list->objects.end() &&
|
||||
!cur_list->synced)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_SYNC,
|
||||
},
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
cur_list->synced = true;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to sync OSD %ju (retval=%jd)\n",
|
||||
cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
error_count++;
|
||||
}
|
||||
delete op;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
@@ -161,7 +195,8 @@ struct rm_inode_t
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end() &&
|
||||
lists[i]->synced)
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
@@ -178,7 +213,9 @@ struct rm_inode_t
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
fprintf(stderr, "\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
fprintf(stderr, parent->color
|
||||
? "\rRemoved %ju/%ju objects, %ju more PGs to list..."
|
||||
: "Removed %ju/%ju objects, %ju more PGs to list...\n", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
@@ -187,17 +224,18 @@ struct rm_inode_t
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0))
|
||||
bool is_error = (total_done < total_count || inactive_osds.size() > 0 || error_count > 0);
|
||||
if (parent->progress && is_error)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n"
|
||||
" Use `vitastor-cli rm-data --pool %u --inode %lu` if you encounter it in listings.\n",
|
||||
stderr, "Warning: Pool:%u,ID:%ju inode data may not have been fully removed.\n"
|
||||
"Use `vitastor-cli rm-data --pool %u --inode %ju` if you encounter it in listings.\n",
|
||||
pool_id, INODE_NO_POOL(inode), pool_id, INODE_NO_POOL(inode)
|
||||
);
|
||||
}
|
||||
result = (cli_result_t){
|
||||
.err = error_count > 0 ? EIO : 0,
|
||||
.text = error_count > 0 ? "Some blocks were not removed" : (
|
||||
.err = is_error && !down_ok ? EIO : 0,
|
||||
.text = is_error ? "Some blocks were not removed" : (
|
||||
"Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
|
||||
std::to_string(pool_id)+" removed"),
|
||||
.data = json11::Json::object {
|
||||
@@ -246,6 +284,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
|
||||
{
|
||||
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->down_ok = cfg["down_ok"].bool_value();
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
remover->min_offset = cfg["min_offset"].uint64_value();
|
||||
return [remover](cli_result_t & result)
|
||||
|
@@ -106,7 +106,7 @@ resume_2:
|
||||
if (etcd_states[i]["error"].is_null())
|
||||
{
|
||||
etcd_alive++;
|
||||
etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
|
||||
etcd_db_size = etcd_states[i]["dbSize"].uint64_value();
|
||||
}
|
||||
}
|
||||
int mon_count = 0;
|
||||
@@ -132,7 +132,7 @@ resume_2:
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
|
||||
osd_num_t stat_osd_num = 0;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
|
||||
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%ju%c", &stat_osd_num, &null_byte);
|
||||
if (scanned != 1 || !stat_osd_num)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
@@ -283,7 +283,7 @@ resume_2:
|
||||
}
|
||||
printf(
|
||||
" cluster:\n"
|
||||
" etcd: %d / %ld up, %s database size\n"
|
||||
" etcd: %d / %zd up, %s database size\n"
|
||||
" mon: %d up%s\n"
|
||||
" osd: %d / %d up\n"
|
||||
" \n"
|
||||
|
@@ -265,7 +265,7 @@ void cluster_client_t::erase_op(cluster_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_ops(bool up_retry)
|
||||
void cluster_client_t::continue_ops(int time_passed)
|
||||
{
|
||||
if (!pgs_loaded)
|
||||
{
|
||||
@@ -277,22 +277,27 @@ void cluster_client_t::continue_ops(bool up_retry)
|
||||
// Attempt to reenter the function
|
||||
return;
|
||||
}
|
||||
int reset_duration = 0;
|
||||
restart:
|
||||
continuing_ops = 1;
|
||||
for (auto op = op_queue_head; op; )
|
||||
{
|
||||
cluster_op_t *next_op = op->next;
|
||||
if (!op->up_wait || up_retry)
|
||||
if (op->retry_after && time_passed)
|
||||
{
|
||||
op->up_wait = false;
|
||||
if (!op->prev_wait)
|
||||
op->retry_after = op->retry_after > time_passed ? op->retry_after-time_passed : 0;
|
||||
if (op->retry_after && (!reset_duration || op->retry_after < reset_duration))
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
reset_duration = op->retry_after;
|
||||
}
|
||||
}
|
||||
if (!op->retry_after && !op->prev_wait)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
op = next_op;
|
||||
if (continuing_ops == 2)
|
||||
{
|
||||
@@ -300,6 +305,27 @@ restart:
|
||||
}
|
||||
}
|
||||
continuing_ops = 0;
|
||||
reset_retry_timer(reset_duration);
|
||||
}
|
||||
|
||||
void cluster_client_t::reset_retry_timer(int new_duration)
|
||||
{
|
||||
if (retry_timeout_duration && retry_timeout_duration <= new_duration || !new_duration)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (retry_timeout_id)
|
||||
{
|
||||
tfd->clear_timer(retry_timeout_id);
|
||||
}
|
||||
retry_timeout_duration = new_duration;
|
||||
retry_timeout_id = tfd->set_timer(retry_timeout_duration, false, [this](int)
|
||||
{
|
||||
int time_passed = retry_timeout_duration;
|
||||
retry_timeout_id = 0;
|
||||
retry_timeout_duration = 0;
|
||||
continue_ops(time_passed);
|
||||
});
|
||||
}
|
||||
|
||||
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
|
||||
@@ -349,16 +375,28 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
||||
{
|
||||
client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
|
||||
}
|
||||
// up_wait_retry_interval
|
||||
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
|
||||
if (!up_wait_retry_interval)
|
||||
// client_retry_interval
|
||||
client_retry_interval = config["client_retry_interval"].uint64_value();
|
||||
if (!client_retry_interval)
|
||||
{
|
||||
up_wait_retry_interval = 500;
|
||||
client_retry_interval = 50;
|
||||
}
|
||||
else if (up_wait_retry_interval < 50)
|
||||
else if (client_retry_interval < 10)
|
||||
{
|
||||
up_wait_retry_interval = 50;
|
||||
client_retry_interval = 10;
|
||||
}
|
||||
// client_eio_retry_interval
|
||||
client_eio_retry_interval = 1000;
|
||||
if (!config["client_eio_retry_interval"].is_null())
|
||||
{
|
||||
client_eio_retry_interval = config["client_eio_retry_interval"].uint64_value();
|
||||
if (client_eio_retry_interval && client_eio_retry_interval < 10)
|
||||
{
|
||||
client_eio_retry_interval = 10;
|
||||
}
|
||||
}
|
||||
// log_level
|
||||
log_level = config["log_level"].uint64_value();
|
||||
msgr.parse_config(config);
|
||||
st_cli.parse_config(config);
|
||||
st_cli.load_pgs();
|
||||
@@ -703,6 +741,8 @@ resume_1:
|
||||
}
|
||||
goto resume_2;
|
||||
}
|
||||
// Protect from try_send completing the operation immediately
|
||||
op->inflight_count++;
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
if (!(op->parts[i].flags & PART_SENT))
|
||||
@@ -712,22 +752,17 @@ resume_1:
|
||||
// We'll need to retry again
|
||||
if (op->parts[i].flags & PART_RETRY)
|
||||
{
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
{
|
||||
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||
{
|
||||
retry_timeout_id = 0;
|
||||
continue_ops(true);
|
||||
});
|
||||
}
|
||||
op->retry_after = client_retry_interval;
|
||||
reset_retry_timer(client_retry_interval);
|
||||
}
|
||||
op->state = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
op->inflight_count--;
|
||||
if (op->state == 1)
|
||||
{
|
||||
// Some suboperations have to be resent
|
||||
return 0;
|
||||
}
|
||||
resume_2:
|
||||
@@ -774,10 +809,9 @@ resume_2:
|
||||
return 1;
|
||||
}
|
||||
else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
|
||||
op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
|
||||
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC)
|
||||
{
|
||||
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
|
||||
// FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
@@ -1147,31 +1181,30 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
if (op->retval != -EINTR && op->retval != -EIO && op->retval != -ENOSPC)
|
||||
{
|
||||
stop_fd = part->op.peer_fd;
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
if (op->retval != -EPIPE || log_level > 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d), dropping connection\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
|
||||
stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d)\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
}
|
||||
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
|
||||
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
|
||||
// FIXME postpone such things to set_immediate here to avoid bugs
|
||||
// Mark op->up_wait = true to retry operation after a short pause (not immediately)
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
// Set op->retry_after to retry operation after a short pause (not immediately)
|
||||
if (!op->retry_after)
|
||||
{
|
||||
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||
{
|
||||
retry_timeout_id = 0;
|
||||
continue_ops(true);
|
||||
});
|
||||
op->retry_after = op->retval == -EIO ? client_eio_retry_interval : client_retry_interval;
|
||||
}
|
||||
reset_retry_timer(op->retry_after);
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
|
@@ -59,7 +59,7 @@ protected:
|
||||
void *buf = NULL;
|
||||
cluster_op_t *orig_op = NULL;
|
||||
bool needs_reslice = false;
|
||||
bool up_wait = false;
|
||||
int retry_after = 0;
|
||||
int inflight_count = 0, done_count = 0;
|
||||
std::vector<cluster_op_part_t> parts;
|
||||
void *part_bitmaps = NULL;
|
||||
@@ -91,10 +91,12 @@ class cluster_client_t
|
||||
uint64_t client_max_buffered_ops = 0;
|
||||
uint64_t client_max_writeback_iodepth = 0;
|
||||
|
||||
int log_level;
|
||||
int up_wait_retry_interval = 500; // ms
|
||||
int log_level = 0;
|
||||
int client_retry_interval = 50; // ms
|
||||
int client_eio_retry_interval = 1000; // ms
|
||||
|
||||
int retry_timeout_id = 0;
|
||||
int retry_timeout_duration = 0;
|
||||
std::vector<cluster_op_t*> offline_ops;
|
||||
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
|
||||
writeback_cache_t *wb = NULL;
|
||||
@@ -131,7 +133,7 @@ public:
|
||||
|
||||
bool get_immediate_commit(uint64_t inode);
|
||||
|
||||
void continue_ops(bool up_retry = false);
|
||||
void continue_ops(int time_passed = 0);
|
||||
inode_list_t *list_inode_start(inode_t inode,
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
|
||||
int list_pg_count(inode_list_t *lst);
|
||||
@@ -152,6 +154,7 @@ protected:
|
||||
int continue_rw(cluster_op_t *op);
|
||||
bool check_rw(cluster_op_t *op);
|
||||
void slice_rw(cluster_op_t *op);
|
||||
void reset_retry_timer(int new_duration);
|
||||
bool try_send(cluster_op_t *op, int i);
|
||||
int continue_sync(cluster_op_t *op);
|
||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||
|
@@ -226,7 +226,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %ju (retval=%jd), skipping\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
else
|
||||
@@ -236,7 +236,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||
cur_list->pg->has_unstable = true;
|
||||
fprintf(
|
||||
stderr, "[PG %u/%u] Inode still has %lu unstable object versions out of total %lu - is it still open?\n",
|
||||
stderr, "[PG %u/%u] Inode still has %ju unstable object versions out of total %ju - is it still open?\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count,
|
||||
op->reply.hdr.retval
|
||||
);
|
||||
@@ -244,7 +244,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||
stderr, "[PG %u/%u] Got inode object list from OSD %ju: %jd object versions\n",
|
||||
cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
|
@@ -47,7 +47,7 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
if (!bitmap_granularity)
|
||||
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||
if (!journal_size)
|
||||
journal_size = 16*1024*1024;
|
||||
journal_size = 32*1024*1024;
|
||||
if (!device_block_size)
|
||||
device_block_size = 4096;
|
||||
if (!data_csum_type)
|
||||
@@ -75,9 +75,9 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
if (st.st_blksize < device_block_size)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Warning: %s reports %lu byte blocks, but we use %lu."
|
||||
" Set --device_block_size=%lu if you're sure it works well with %lu byte blocks.\n",
|
||||
device.c_str(), st.st_blksize, device_block_size, st.st_blksize, st.st_blksize
|
||||
stderr, "Warning: %s reports %ju byte blocks, but we use %ju."
|
||||
" Set --device_block_size=%ju if you're sure it works well with %ju byte blocks.\n",
|
||||
device.c_str(), (uint64_t)st.st_blksize, device_block_size, (uint64_t)st.st_blksize, (uint64_t)st.st_blksize
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -99,19 +99,19 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
if (device_block_size < 512 || device_block_size > 1048576 ||
|
||||
device_block_size & (device_block_size-1) != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid device block size specified: %lu\n", device_block_size);
|
||||
fprintf(stderr, "Invalid device block size specified: %ju\n", device_block_size);
|
||||
exit(1);
|
||||
}
|
||||
if (data_block_size < device_block_size || data_block_size > MAX_DATA_BLOCK_SIZE ||
|
||||
data_block_size & (data_block_size-1) != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid object size specified: %lu\n", data_block_size);
|
||||
fprintf(stderr, "Invalid object size specified: %ju\n", data_block_size);
|
||||
exit(1);
|
||||
}
|
||||
if (bitmap_granularity < device_block_size || bitmap_granularity > data_block_size ||
|
||||
bitmap_granularity & (bitmap_granularity-1) != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid bitmap granularity specified: %lu\n", bitmap_granularity);
|
||||
fprintf(stderr, "Invalid bitmap granularity specified: %ju\n", bitmap_granularity);
|
||||
exit(1);
|
||||
}
|
||||
if (csum_block_size && (data_block_size % csum_block_size))
|
||||
@@ -145,8 +145,8 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
{
|
||||
// Env
|
||||
printf(
|
||||
"meta_block_size=%lu\njournal_block_size=%lu\ndata_size=%lu\n"
|
||||
"data_device=%s\njournal_offset=%lu\nmeta_offset=%lu\ndata_offset=%lu\n",
|
||||
"meta_block_size=%ju\njournal_block_size=%ju\ndata_size=%ju\n"
|
||||
"data_device=%s\njournal_offset=%ju\nmeta_offset=%ju\ndata_offset=%ju\n",
|
||||
device_block_size, device_block_size, device_size-data_offset,
|
||||
device.c_str(), journal_offset, meta_offset, data_offset
|
||||
);
|
||||
@@ -160,14 +160,14 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
}
|
||||
if (device_block_size != 4096)
|
||||
{
|
||||
printf("--meta_block_size %lu\n--journal_block_size %lu\n", device_block_size, device_block_size);
|
||||
printf("--meta_block_size %ju\n--journal_block_size %ju\n", device_block_size, device_block_size);
|
||||
}
|
||||
if (orig_device_size)
|
||||
{
|
||||
printf("--data_size %lu\n", device_size-data_offset);
|
||||
printf("--data_size %ju\n", device_size-data_offset);
|
||||
}
|
||||
printf(
|
||||
"--data_device %s\n--journal_offset %lu\n--meta_offset %lu\n--data_offset %lu\n",
|
||||
"--data_device %s\n--journal_offset %ju\n--meta_offset %ju\n--data_offset %ju\n",
|
||||
device.c_str(), journal_offset, meta_offset, data_offset
|
||||
);
|
||||
}
|
||||
|
@@ -167,7 +167,7 @@ static const char *help_text =
|
||||
" Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
|
||||
" --object_size 128k Set blockstore block size\n"
|
||||
" --bitmap_granularity 4k Set bitmap granularity\n"
|
||||
" --journal_size 16M Set journal size\n"
|
||||
" --journal_size 32M Set journal size\n"
|
||||
" --data_csum_type none Set data checksum type (crc32c or none)\n"
|
||||
" --csum_block_size 4k Set data checksum block size\n"
|
||||
" --device_block_size 4k Set device block size\n"
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _LARGEFILE64_SOURCE
|
||||
#define _LARGEFILE64_SOURCE 1
|
||||
#define _LARGEFILE64_SOURCE
|
||||
#endif
|
||||
|
||||
#include <map>
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user