Compare commits

12 commits

Author | SHA1 | Date
---|---|---
Vitaliy Filippov | 2ead06e126 |
Vitaliy Filippov | a5d5559f8e |
Vitaliy Filippov | e8e7ba8fde |
Vitaliy Filippov | 6fd831a299 |
Vitaliy Filippov | 069808dfce |
Vitaliy Filippov | bcefa42bc0 |
Vitaliy Filippov | 4636e02d43 |
Vitaliy Filippov | e4c7d1c147 |
Vitaliy Filippov | a4677f3e69 |
Vitaliy Filippov | 7cbf207d65 |
Vitaliy Filippov | 7c9711af20 |
Vitaliy Filippov | 33ef701464 |
@@ -13,7 +13,7 @@ Vitastor configuration consists of:
 - [Separate OSD settings](config/pool.en.md#osd-settings)
 - [Inode configuration](config/inode.en.md) i.e. image metadata like name, size and parent reference
 
-Configuration parameters can be set in 3 places:
+Configuration parameters can be set in 4 places:
 - Configuration file (`/etc/vitastor/vitastor.conf` or other path)
 - etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
   connection parameters should obviously be set in the configuration file.
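The hunk above documents two of the configuration sources: the local file and the cluster-wide etcd key. A minimal sketch of how a script could merge just these two sources is shown below; the `etcdGet` helper and the precedence order are assumptions for illustration, not the real Vitastor code:

```
// Sketch only: merge /etc/vitastor/vitastor.conf (JSON) with the cluster-wide
// defaults stored under the etcd key /vitastor/config/global.
// 'etcdGet' is an assumed helper; the precedence shown here is illustrative.
const fs = require('fs');

async function loadConfig(etcdGet)
{
    // Local configuration file - etcd connection parameters must live here,
    // as the documentation above explains
    const fileConf = JSON.parse(fs.readFileSync('/etc/vitastor/vitastor.conf', 'utf8'));
    // Cluster-wide defaults from etcd
    const globalConf = await etcdGet('/vitastor/config/global') || {};
    // Assumption: local settings override cluster-wide defaults
    return { ...globalConf, ...fileConf };
}
```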
@@ -14,7 +14,7 @@
 - [Inode settings](config/inode.ru.md), i.e. image metadata such as the name, size and references to
   the parent image
 
-Configuration parameters can be set in 3 places:
+Configuration parameters can be set in 4 places:
 - The configuration file (`/etc/vitastor/vitastor.conf` or another path)
 - The etcd key `/vitastor/config/global`. Most parameters can be
   set there, except, naturally, the etcd connection parameters themselves,
@@ -14,6 +14,7 @@
 - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
 - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
 - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
 - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
   stable version from 0.9.x branch instead of 1.x
 - Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
@@ -14,6 +14,7 @@
 - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
 - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
 - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
 - Add `-oldstable` to the bookworm/bullseye/buster word in this line to install
   the latest stable version from the 0.9.x branch instead of 1.x
 - Install the packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
@@ -32,7 +32,7 @@
 
 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320/P5530, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
 - HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
 
 ## Configure monitors
@@ -32,7 +32,7 @@
 
 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320/P5530, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
 - HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
 
 ## Configure monitors
@@ -37,7 +37,7 @@ It supports the following commands:
 Global options:
 
 ```
---config_file FILE  Path to Vitastor configuration file
+--config_path FILE  Path to Vitastor configuration file
 --etcd_address URL  Etcd connection address
 --iodepth N         Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M   Work with M osds in parallel when possible (default 4)
@@ -36,7 +36,7 @@ vitastor-cli - command line interface for adm
 Global options:
 
 ```
---config_file FILE  Path to the Vitastor configuration file
+--config_path FILE  Path to the Vitastor configuration file
 --etcd_address URL  etcd connection address
 --iodepth N         Send N operations in parallel to each OSD (default 32)
 --parallel_osds M   Work with M OSDs in parallel (default 4)
@@ -36,7 +36,7 @@ It will output a block device name like /dev/nbd0 which you can then use as a no
 
 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
 
-vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
+vitastor-nbd supports all usual Vitastor configuration options like `--config_path <path_to_config>` plus NBD-specific:
 
 * `--nbd_timeout 0` \
   Timeout for I/O operations in seconds after exceeding which the kernel stops the device.

@@ -54,7 +54,7 @@ vitastor-nbd supports all usual Vitastor configuration options like `--config_fi
   Stay in foreground, do not daemonize.
 
 Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
-in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
+in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_path`.
 
 ## unmap
 
@@ -41,7 +41,7 @@ vitastor-nbd map [/dev/nbdN] --image testimg
 To address an image by inode number, like in other commands, you can use the options
 `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image testimg`.
 
-vitastor-nbd supports all usual Vitastor options, for example `--config_file <path_to_config>`,
+vitastor-nbd supports all usual Vitastor options, for example `--config_path <path_to_config>`,
 plus NBD-specific ones:
 
 * `--nbd_timeout 0` \

@@ -62,7 +62,7 @@ vitastor-nbd supports all usual Vitastor options,
 
 Note that the `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options can also
 be set in `/etc/vitastor/vitastor.conf` or in another configuration file
-specified with the `--config_file` option.
+specified with the `--config_path` option.
 
 ## unmap
 
mon/mon.js (34 lines changed)
@@ -773,23 +773,27 @@ class Mon
                 }
             }
         }
-        for (const pool_id in this.state.pool.stats)
+        if (!this.recheck_pgs_active)
         {
-            if (!seen_pools[pool_id])
+            // PG recheck also modifies /pool/stats, so don't touch it here if it's active
+            for (const pool_id in this.state.pool.stats)
             {
-                txn.push({ requestDeleteRange: {
-                    key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
-                } });
-                delete this.state.pool.stats[pool_id];
-            }
-            else
-            {
-                const pool_stats = { ...this.state.pool.stats[pool_id] };
-                serialize_bigints(pool_stats);
-                txn.push({ requestPut: {
-                    key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
-                    value: b64(JSON.stringify(pool_stats)),
-                } });
+                if (!seen_pools[pool_id])
+                {
+                    txn.push({ requestDeleteRange: {
+                        key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
+                    } });
+                    delete this.state.pool.stats[pool_id];
+                }
+                else
+                {
+                    const pool_stats = { ...this.state.pool.stats[pool_id] };
+                    serialize_bigints(pool_stats);
+                    txn.push({ requestPut: {
+                        key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
+                        value: b64(JSON.stringify(pool_stats)),
+                    } });
+                }
             }
         }
         if (txn.length)
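Two helpers carried over unchanged in the hunk above deserve a note: `b64()` base64-encodes keys and values for the etcd v3 JSON API, and `serialize_bigints()` converts BigInt counters to strings so they survive `JSON.stringify()`. The sketch below only illustrates the shape of the resulting transaction; the helper bodies, the stats field name and the values are simplified stand-ins, not the actual mon.js implementations:

```
// Illustrative stand-ins for the monitor helpers used above (not the real code).
const b64 = str => Buffer.from(str).toString('base64');

function serialize_bigints(obj)
{
    for (const k in obj)
    {
        if (typeof obj[k] === 'bigint')
            obj[k] = obj[k].toString();
        else if (obj[k] && typeof obj[k] === 'object')
            serialize_bigints(obj[k]);
    }
}

// Shape of the etcd transaction assembled in the hunk (default prefix /vitastor);
// 'example_counter' is a hypothetical field name.
const txn = [
    { requestPut: { key: b64('/vitastor/pool/stats/1'), value: b64(JSON.stringify({ example_counter: '12345' })) } },
    { requestDeleteRange: { key: b64('/vitastor/pool/stats/2') } },
];
console.log(JSON.stringify(txn, null, 2));
```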
@@ -9,7 +9,7 @@
   "author": "Vitaliy Filippov",
   "license": "UNLICENSED",
   "dependencies": {
-    "antietcd": "^1.1.0",
+    "antietcd": "^1.1.2",
     "sprintf-js": "^1.1.2",
     "ws": "^7.2.5"
   },
@@ -8,23 +8,9 @@ const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
 const { scale_pg_count } = require('./pg_utils.js');
 const { make_hier_tree, filter_osds_by_root_node,
     filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
+const { select_murmur3 } = require('./lp_optimizer/murmur3.js');
 
-let seed;
-
-function reset_rng()
-{
-    seed = 0x5f020e43;
-}
-
-function rng()
-{
-    seed ^= seed << 13;
-    seed ^= seed >> 17;
-    seed ^= seed << 5;
-    return seed + 2147483648;
-}
-
-function pick_primary(pool_config, osd_set, up_osds, aff_osds)
+function pick_primary(pool_id, pg_num, pool_config, osd_set, up_osds, aff_osds)
 {
     let alive_set;
     if (pool_config.scheme === 'replicated')

@@ -52,7 +38,7 @@ function pick_primary(pool_config, osd_set, up_osds, aff_osds)
     {
         return 0;
     }
-    return alive_set[rng() % alive_set.length];
+    return alive_set[select_murmur3(alive_set.length, osd_num => pool_id+'/'+pg_num+'/'+osd_num)];
 }
 
 function recheck_primary(state, global_config, up_osds, osd_tree)

@@ -66,7 +52,6 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
             continue;
         }
         const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
-        reset_rng();
         for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
         {
             if (!state.pg.config.items[pool_id])

@@ -76,7 +61,7 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
             const pg_cfg = state.pg.config.items[pool_id][pg_num];
             if (pg_cfg)
             {
-                const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
+                const new_primary = pick_primary(pool_id, pg_num, state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
                 if (pg_cfg.primary != new_primary)
                 {
                     if (!new_pg_config)

@@ -99,13 +84,12 @@ function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revis
 {
     const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
     const pg_items = {};
-    reset_rng();
     new_pgs.map((osd_set, i) =>
     {
         osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
         pg_items[i+1] = {
             osd_set,
-            primary: pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds),
+            primary: pick_primary(pool_id, i+1, state.config.pools[pool_id], osd_set, up_osds, aff_osds),
         };
         if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
             prev_pgs[i].filter(osd_num => osd_num).length > 0)
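The hunks above replace a sequential xorshift RNG with a hash keyed by pool, PG number and OSD number, so the chosen primary depends only on that key and not on how many PGs were processed before it. A minimal sketch of the idea follows; `simple_hash` and the highest-hash selection are illustrative stand-ins, not the real `select_murmur3` from `lp_optimizer/murmur3.js`:

```
// Sketch only: deterministic primary selection keyed by (pool, pg, osd).
// simple_hash is a placeholder; Vitastor uses murmur3 (select_murmur3).
function simple_hash(str)
{
    let h = 0x811c9dc5;
    for (let i = 0; i < str.length; i++)
    {
        h ^= str.charCodeAt(i);
        h = (h * 0x01000193) >>> 0; // FNV-1a style, 32-bit
    }
    return h >>> 0;
}

// Pick the alive OSD with the highest hash of "pool/pg/osd".
// The result depends only on the key, so restarting the monitor or
// re-ordering the PG loop does not reshuffle primaries.
function pick_primary_sketch(pool_id, pg_num, alive_set)
{
    let best = 0, best_hash = -1;
    for (const osd_num of alive_set)
    {
        const h = simple_hash(pool_id+'/'+pg_num+'/'+osd_num);
        if (h > best_hash)
        {
            best_hash = h;
            best = osd_num;
        }
    }
    return best;
}

// The same inputs always give the same primary:
console.log(pick_primary_sketch(1, 5, [ 3, 7, 12 ]));
```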
@@ -16,6 +16,10 @@ NAN_MODULE_INIT(InitAddon)
     Nan::SetPrototypeMethod(tpl, "write", NodeVitastor::Write);
     Nan::SetPrototypeMethod(tpl, "sync", NodeVitastor::Sync);
     Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastor::ReadBitmap);
+    Nan::SetPrototypeMethod(tpl, "on_ready", NodeVitastor::OnReady);
+    Nan::SetPrototypeMethod(tpl, "get_min_io_size", NodeVitastor::GetMinIoSize);
+    Nan::SetPrototypeMethod(tpl, "get_max_atomic_write_size", NodeVitastor::GetMaxAtomicWriteSize);
+    Nan::SetPrototypeMethod(tpl, "get_immediate_commit", NodeVitastor::GetImmediateCommit);
     //Nan::SetPrototypeMethod(tpl, "destroy", NodeVitastor::Destroy);
 
     Nan::Set(target, Nan::New("Client").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());

@@ -63,6 +67,10 @@ NAN_MODULE_INIT(InitAddon)
     Nan::Set(target, Nan::New("ENOSYS").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSYS));
     Nan::Set(target, Nan::New("EAGAIN").ToLocalChecked(), Nan::New<v8::Int32>(-EAGAIN));
 
+    Nan::Set(target, Nan::New("IMMEDIATE_NONE").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_NONE));
+    Nan::Set(target, Nan::New("IMMEDIATE_SMALL").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_SMALL));
+    Nan::Set(target, Nan::New("IMMEDIATE_ALL").ToLocalChecked(), Nan::New<v8::Int32>(IMMEDIATE_ALL));
+
     // Listing handle
 
     tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKVListing::Create);
@@ -267,6 +267,64 @@ static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long
     nanCallback.Call(1, args, req);
 }
 
+// on_ready(callback(err))
+NAN_METHOD(NodeVitastor::OnReady)
+{
+    TRACE("NodeVitastor::OnReady");
+    if (info.Length() < 1)
+        Nan::ThrowError("Not enough arguments to on_ready(callback(err))");
+    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
+    v8::Local<v8::Function> callback = info[0].As<v8::Function>();
+    auto req = new NodeVitastorRequest(self, callback);
+    self->Ref();
+    vitastor_c_on_ready(self->c, on_ready_finish, req);
+}
+
+void NodeVitastor::on_ready_finish(void *opaque, long retval)
+{
+    TRACE("NodeVitastor::on_ready_finish");
+    auto req = (NodeVitastorRequest*)opaque;
+    auto self = req->cli;
+    Nan::HandleScope scope;
+    Nan::Callback nanCallback(Nan::New(req->callback));
+    nanCallback.Call(0, NULL, req);
+    self->Unref();
+    delete req;
+}
+
+// get_min_io_size(pool_id)
+NAN_METHOD(NodeVitastor::GetMinIoSize)
+{
+    TRACE("NodeVitastor::GetMinIoSize");
+    if (info.Length() < 1)
+        Nan::ThrowError("Not enough arguments to get_min_io_size(pool_id)");
+    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
+    uint64_t pool = get_ui64(info[0]);
+    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_bitmap_granularity(self->c, INODE_WITH_POOL(pool, 1))));
+}
+
+// get_max_atomic_write_size(pool_id)
+NAN_METHOD(NodeVitastor::GetMaxAtomicWriteSize)
+{
+    TRACE("NodeVitastor::GetMaxAtomicWriteSize");
+    if (info.Length() < 1)
+        Nan::ThrowError("Not enough arguments to get_max_atomic_write_size(pool_id)");
+    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
+    uint64_t pool = get_ui64(info[0]);
+    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_block_size(self->c, INODE_WITH_POOL(pool, 1))));
+}
+
+// get_immediate_commit(pool_id)
+NAN_METHOD(NodeVitastor::GetImmediateCommit)
+{
+    TRACE("NodeVitastor::GetImmediateCommit");
+    if (info.Length() < 1)
+        Nan::ThrowError("Not enough arguments to get_immediate_commit(pool_id)");
+    NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
+    uint64_t pool = get_ui64(info[0]);
+    info.GetReturnValue().Set(Nan::New<v8::Number>(vitastor_c_inode_get_immediate_commit(self->c, INODE_WITH_POOL(pool, 1))));
+}
+
 void NodeVitastor::on_read_finish(void *opaque, long retval, uint64_t version)
 {
     TRACE("NodeVitastor::on_read_finish");
@@ -15,14 +15,22 @@ class NodeVitastor: public Nan::ObjectWrap
 public:
     // constructor({ ...config })
     static NAN_METHOD(Create);
-    // read(pool, inode, offset, len, callback(err, buffer, version))
+    // read(pool_id, inode, offset, len, callback(err, buffer, version))
     static NAN_METHOD(Read);
-    // write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
+    // write(pool_id, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
     static NAN_METHOD(Write);
     // sync(callback(err))
     static NAN_METHOD(Sync);
-    // read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
+    // read_bitmap(pool_id, inode, offset, len, with_parents, callback(err, bitmap_buffer))
    static NAN_METHOD(ReadBitmap);
+    // on_ready(callback(err))
+    static NAN_METHOD(OnReady);
+    // get_min_io_size(pool_id)
+    static NAN_METHOD(GetMinIoSize);
+    // get_max_atomic_write_size(pool_id)
+    static NAN_METHOD(GetMaxAtomicWriteSize);
+    // get_immediate_commit(pool_id)
+    static NAN_METHOD(GetImmediateCommit);
     // // destroy()
     // static NAN_METHOD(Destroy);
 

@@ -37,6 +45,7 @@ private:
 
     static void on_io_readable(uv_poll_t* handle, int status, int revents);
     static void on_read_finish(void *opaque, long retval, uint64_t version);
+    static void on_ready_finish(void *opaque, long retval);
    static void on_write_finish(void *opaque, long retval);
     static void on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);
 
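The declarations above list the full API surface added to the node.js binding in this change. A small usage sketch follows; the module name `vitastor` and the constructor options are assumptions for illustration, while the method names, the `Client` export and the `IMMEDIATE_*` constants come from the diff itself:

```
// Usage sketch for the new binding methods (assumed module name 'vitastor').
const vitastor = require('vitastor');

// constructor({ ...config }) - the config keys passed here are illustrative
const cli = new vitastor.Client({ config_path: '/etc/vitastor/vitastor.conf' });

// on_ready(callback(err)): wait until the client has loaded cluster metadata;
// after that the per-pool getters added in this change return meaningful values.
cli.on_ready(() =>
{
    const pool_id = 1; // example pool
    console.log('min I/O size:          ', cli.get_min_io_size(pool_id));
    console.log('max atomic write size: ', cli.get_max_atomic_write_size(pool_id));
    const imm = cli.get_immediate_commit(pool_id);
    console.log('immediate commit mode: ',
        imm == vitastor.IMMEDIATE_ALL ? 'all' :
        imm == vitastor.IMMEDIATE_SMALL ? 'small' : 'none');
});
```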
@@ -313,7 +313,7 @@ const char *help_text =
 #endif
 "Use vitastor-nbd --help <command> for command details or vitastor-nbd --help --all for all details.\n"
 "\n"
-"All usual Vitastor config options like --config_file <path_to_config> may also be specified in CLI.\n"
+"All usual Vitastor config options like --config_path <path_to_config> may also be specified in CLI.\n"
 ;
 
 class nbd_proxy
@@ -222,6 +222,14 @@ int vitastor_c_is_ready(vitastor_c *client)
     return client->cli->is_ready();
 }
 
+void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque)
+{
+    client->cli->on_ready([=]()
+    {
+        cb(opaque, 0);
+    });
+}
+
 void vitastor_c_uring_wait_ready(vitastor_c *client)
 {
     while (!client->cli->is_ready())
@@ -51,6 +51,7 @@ vitastor_c *vitastor_c_create_epoll_json(const char **options, int options_len);
 void* vitastor_c_get_internal_client(vitastor_c *client);
 void vitastor_c_destroy(vitastor_c *client);
 int vitastor_c_is_ready(vitastor_c *client);
+void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque);
 int vitastor_c_uring_register_eventfd(vitastor_c *client);
 void vitastor_c_uring_wait_ready(vitastor_c *client);
 void vitastor_c_uring_handle_events(vitastor_c *client);
@@ -224,7 +224,7 @@ static const char* help_text =
 "Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
 "\n"
 "GLOBAL OPTIONS:\n"
-"  --config_file FILE  Path to Vitastor configuration file\n"
+"  --config_path FILE  Path to Vitastor configuration file\n"
 "  --etcd_address URL  Etcd connection address\n"
 "  --iodepth N         Send N operations in parallel to each OSD when possible (default 32)\n"
 "  --parallel_osds M   Work with M osds in parallel when possible (default 4)\n"
@@ -281,6 +281,8 @@ class osd_t
     int pick_next_scrub(object_id & next_oid);
     void submit_scrub_op(object_id oid);
     bool continue_scrub();
+    void submit_scrub_subops(osd_op_t *cur_op);
+    void scrub_check_results(osd_op_t *cur_op);
     void plan_scrub(pg_t & pg, bool report_state = true);
     void schedule_scrub(pg_t & pg);
 

@@ -313,7 +315,7 @@ class osd_t
     pg_osd_set_state_t *mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
         std::function<int(pg_osd_set_t & new_set)> calc_set);
     pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
-        osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
+        osd_rmw_stripe_t *stripes, bool ref);
     pg_osd_set_state_t *mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
         osd_rmw_stripe_t *stripes, bool ref);
     void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);

@@ -326,6 +328,8 @@ class osd_t
     void submit_primary_subops(int submit_type, uint64_t op_version, const uint64_t* osd_set, osd_op_t *cur_op);
     int submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
         osd_rmw_stripe_t *stripes, const uint64_t* osd_set, osd_op_t *cur_op, int subop_idx, int zero_read);
+    void submit_primary_subop(osd_op_t *cur_op, osd_op_t *subop,
+        osd_rmw_stripe_t *si, bool wr, inode_t inode, uint64_t op_version);
     void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
     void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
     int submit_primary_sync_subops(osd_op_t *cur_op);
@@ -28,7 +28,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
         return false;
     }
     auto & pool_cfg = pool_cfg_it->second;
-    // FIXME: op_data->pg_data_size can probably be removed (there's pg.pg_data_size)
     uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
     uint64_t pg_block_size = bs_block_size * pg_data_size;
     object_id oid = {

@@ -52,9 +51,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
         finish_op(cur_op, -EINVAL);
         return false;
     }
-    // Scrub is similar to r/w, so it's also handled here
-    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
-        && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
+    int stripe_count = (cur_op->req.hdr.opcode == OSD_OP_SCRUB ? 0 :
+        (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size));
     int chain_size = 0;
     if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
     {

@@ -110,20 +108,22 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
     );
     void *data_buf = (uint8_t*)op_data + sizeof(osd_primary_op_data_t);
     op_data->pg_num = pg_num;
+    op_data->pg = &pg_it->second;
     op_data->oid = oid;
     op_data->stripes = (osd_rmw_stripe_t*)data_buf;
+    op_data->stripe_count = stripe_count;
     data_buf = (uint8_t*)data_buf + sizeof(osd_rmw_stripe_t) * stripe_count;
-    op_data->scheme = pool_cfg.scheme;
-    op_data->pg_data_size = pg_data_size;
-    op_data->pg_size = pg_it->second.pg_size;
     cur_op->op_data = op_data;
-    split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
-    // Resulting bitmaps have to survive op_data and be freed with the op itself
-    assert(!cur_op->bitmap_buf);
-    cur_op->bitmap_buf = calloc_or_die(1, clean_entry_bitmap_size * stripe_count);
-    for (int i = 0; i < stripe_count; i++)
+    if (cur_op->req.hdr.opcode != OSD_OP_SCRUB)
     {
-        op_data->stripes[i].bmp_buf = (uint8_t*)cur_op->bitmap_buf + clean_entry_bitmap_size * i;
+        split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+        // Resulting bitmaps have to survive op_data and be freed with the op itself
+        assert(!cur_op->bitmap_buf);
+        cur_op->bitmap_buf = calloc_or_die(1, clean_entry_bitmap_size * stripe_count);
+        for (int i = 0; i < stripe_count; i++)
+        {
+            op_data->stripes[i].bmp_buf = (uint8_t*)cur_op->bitmap_buf + clean_entry_bitmap_size * i;
+        }
     }
     op_data->chain_size = chain_size;
     if (chain_size > 0)

@@ -205,11 +205,11 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
 resume_0:
     cur_op->reply.rw.bitmap_len = 0;
     {
-        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+        auto & pg = *op_data->pg;
         if (cur_op->req.rw.len == 0)
         {
             // len=0 => bitmap read
-            for (int role = 0; role < op_data->pg_data_size; role++)
+            for (int role = 0; role < pg.pg_data_size; role++)
             {
                 op_data->stripes[role].read_start = 0;
                 op_data->stripes[role].read_end = UINT32_MAX;

@@ -217,7 +217,7 @@ resume_0:
         }
         else
         {
-            for (int role = 0; role < op_data->pg_data_size; role++)
+            for (int role = 0; role < pg.pg_data_size; role++)
             {
                 op_data->stripes[role].read_start = op_data->stripes[role].req_start;
                 op_data->stripes[role].read_end = op_data->stripes[role].req_end;

@@ -228,29 +228,27 @@ resume_0:
         op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
         // PG may have degraded or misplaced objects
         op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
+        if (pg.state == PG_ACTIVE || pg.scheme == POOL_SCHEME_REPLICATED)
         {
             // Fast happy-path
-            if (op_data->scheme == POOL_SCHEME_REPLICATED &&
+            if (pg.scheme == POOL_SCHEME_REPLICATED &&
                 op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
             {
                 finish_op(cur_op, -EIO);
                 return;
             }
-            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
+            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_data_size, 0);
             submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
             op_data->st = 1;
         }
         else
         {
-            if (extend_missing_stripes(op_data->stripes, op_data->prev_set, op_data->pg_data_size, pg.pg_size) < 0)
+            if (extend_missing_stripes(op_data->stripes, op_data->prev_set, pg.pg_data_size, pg.pg_size) < 0)
            {
                 finish_op(cur_op, -EIO);
                 return;
             }
             // Submit reads
-            op_data->pg_size = pg.pg_size;
-            op_data->scheme = pg.scheme;
             op_data->degraded = 1;
             cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
             submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);

@@ -265,30 +263,29 @@ resume_2:
         if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
         {
             // I/O or checksum error
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
             // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
+            op_data->object_state = mark_object_corrupted(*op_data->pg, op_data->oid, op_data->object_state, op_data->stripes, false);
             goto resume_0;
         }
         finish_op(cur_op, op_data->errcode);
         return;
     }
     cur_op->reply.rw.version = op_data->fact_ver;
-    cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
+    cur_op->reply.rw.bitmap_len = op_data->pg->pg_data_size * clean_entry_bitmap_size;
     if (op_data->degraded)
     {
         // Reconstruct missing stripes
         osd_rmw_stripe_t *stripes = op_data->stripes;
-        if (op_data->scheme == POOL_SCHEME_XOR)
+        if (op_data->pg->scheme == POOL_SCHEME_XOR)
         {
-            reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size);
+            reconstruct_stripes_xor(stripes, op_data->pg->pg_size, clean_entry_bitmap_size);
         }
-        else if (op_data->scheme == POOL_SCHEME_EC)
+        else if (op_data->pg->scheme == POOL_SCHEME_EC)
         {
-            reconstruct_stripes_ec(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
+            reconstruct_stripes_ec(stripes, op_data->pg->pg_size, op_data->pg->pg_data_size, clean_entry_bitmap_size);
         }
         cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
-        for (int role = 0; role < op_data->pg_size; role++)
+        for (int role = 0; role < op_data->pg->pg_size; role++)
         {
             if (stripes[role].req_end != 0)
             {

@@ -360,10 +357,10 @@ pg_osd_set_state_t *osd_t::mark_object(pg_t & pg, object_id oid, pg_osd_set_stat
     return object_state;
 }
 
-pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
-    osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
+pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid,
+    pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
 {
-    return mark_object(pg, oid, prev_object_state, ref, [stripes, inconsistent](pg_osd_set_t & new_set)
+    return mark_object(pg, oid, prev_object_state, ref, [stripes](pg_osd_set_t & new_set)
     {
         // Mark object chunk(s) as corrupted
         int changes = 0;

@@ -390,16 +387,6 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
                     chunk.loc_bad &= ~LOC_CORRUPTED;
                 }
             }
-            if (inconsistent && !chunk.loc_bad)
-            {
-                changes++;
-                chunk.loc_bad |= LOC_INCONSISTENT;
-            }
-            else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
-            {
-                changes++;
-                chunk.loc_bad &= ~LOC_INCONSISTENT;
-            }
             chunk_it++;
         }
         return changes;

@@ -695,7 +682,7 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
         return;
     }
     osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    auto & pg = *op_data->pg;
     if (op_data->st == 1) goto resume_1;
     else if (op_data->st == 2) goto resume_2;
     else if (op_data->st == 3) goto resume_3;
@@ -20,14 +20,15 @@ struct unstable_osd_num_t
 struct osd_primary_op_data_t
 {
     int st = 0;
-    pg_num_t pg_num;
-    object_id oid;
-    uint64_t target_ver;
+    pg_num_t pg_num = 0;
+    object_id oid = {};
+    uint64_t target_ver = 0;
     uint64_t orig_ver = 0, fact_ver = 0;
-    uint64_t scheme = 0;
     int n_subops = 0, done = 0, errors = 0, drops = 0, errcode = 0;
-    int degraded = 0, pg_size, pg_data_size;
-    osd_rmw_stripe_t *stripes;
+    int degraded = 0;
+    int stripe_count = 0;
+    osd_rmw_stripe_t *stripes = NULL;
+    pg_t *pg = NULL;
     osd_op_t *subops = NULL;
     uint64_t *prev_set = NULL;
     pg_osd_set_state_t *object_state = NULL;
@@ -7,7 +7,7 @@
 void osd_t::continue_chained_read(osd_op_t *cur_op)
 {
     osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    auto & pg = *op_data->pg;
     if (op_data->st == 1)
         goto resume_1;
     else if (op_data->st == 2)

@@ -17,7 +17,7 @@ void osd_t::continue_chained_read(osd_op_t *cur_op)
     else if (op_data->st == 4)
         goto resume_4;
     cur_op->reply.rw.bitmap_len = 0;
-    for (int role = 0; role < op_data->pg_data_size; role++)
+    for (int role = 0; role < pg.pg_data_size; role++)
     {
         op_data->stripes[role].read_start = op_data->stripes[role].req_start;
         op_data->stripes[role].read_end = op_data->stripes[role].req_end;

@@ -329,7 +329,7 @@ std::vector<osd_chain_read_t> osd_t::collect_chained_read_requests(osd_op_t *cur
 {
     osd_primary_op_data_t *op_data = cur_op->op_data;
     std::vector<osd_chain_read_t> chain_reads;
-    int stripe_count = (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg_size);
+    int stripe_count = (op_data->pg->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg->pg_size);
     memset(op_data->stripes[0].bmp_buf, 0, stripe_count * clean_entry_bitmap_size);
     uint8_t *global_bitmap = (uint8_t*)op_data->stripes[0].bmp_buf;
     // We always use at most 1 read request per layer

@@ -337,7 +337,7 @@ std::vector<osd_chain_read_t> osd_t::collect_chained_read_requests(osd_op_t *cur
     {
         uint8_t *part_bitmap = ((uint8_t*)op_data->snapshot_bitmaps) + chain_pos*stripe_count*clean_entry_bitmap_size;
         int start = !cur_op->req.rw.len ? 0 : (cur_op->req.rw.offset - op_data->oid.stripe)/bs_bitmap_granularity;
-        int end = !cur_op->req.rw.len ? op_data->pg_data_size*clean_entry_bitmap_size*8 : start + cur_op->req.rw.len/bs_bitmap_granularity;
+        int end = !cur_op->req.rw.len ? op_data->pg->pg_data_size*clean_entry_bitmap_size*8 : start + cur_op->req.rw.len/bs_bitmap_granularity;
         // Skip unneeded part in the beginning
         while (start < end && (
             ((global_bitmap[start>>3] >> (start&7)) & 1) ||

@@ -410,11 +410,11 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
         // FIXME: maybe introduce split_read_stripes to shorten these lines and to remove read_start=req_start
         osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
         split_stripes(pg.pg_data_size, bs_block_size, chain_reads[cri].offset, chain_reads[cri].len, stripes);
-        if (op_data->scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
+        if (pg.scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
         {
             continue;
         }
-        for (int role = 0; role < op_data->pg_data_size; role++)
+        for (int role = 0; role < pg.pg_data_size; role++)
         {
             stripes[role].read_start = stripes[role].req_start;
             stripes[role].read_end = stripes[role].req_end;

@@ -423,7 +423,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
         if (pg.state != PG_ACTIVE)
         {
             cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
-            if (op_data->scheme != POOL_SCHEME_REPLICATED)
+            if (pg.scheme != POOL_SCHEME_REPLICATED)
             {
                 if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
                 {

@@ -446,7 +446,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
                 }
             }
         }
-        if (op_data->scheme == POOL_SCHEME_REPLICATED)
+        if (pg.scheme == POOL_SCHEME_REPLICATED)
         {
             n_subops++;
             read_buffer_size += stripes[0].read_end - stripes[0].read_start;

@@ -491,7 +491,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
     for (int cri = 0; cri < chain_reads.size(); cri++)
     {
         osd_rmw_stripe_t *stripes = chain_stripes + chain_reads[cri].chain_pos*stripe_count;
-        if (op_data->scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
+        if (pg.scheme == POOL_SCHEME_REPLICATED && !stripes[0].req_end)
         {
             continue;
         }

@@ -501,9 +501,9 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
         auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
         uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
         int zero_read = -1;
-        if (op_data->scheme == POOL_SCHEME_REPLICATED)
+        if (pg.scheme == POOL_SCHEME_REPLICATED)
         {
-            for (int role = 0; role < op_data->pg_size; role++)
+            for (int role = 0; role < pg.pg_size; role++)
                 if (cur_set[role] == this->osd_num || zero_read == -1)
                     zero_read = role;
         }

@@ -535,7 +535,7 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
         }
         if (corrupted)
         {
-            mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
+            mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
         }
     }
 }

@@ -555,18 +555,18 @@ void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
         {
             // Reconstruct missing stripes
             osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
-            if (op_data->scheme == POOL_SCHEME_XOR)
+            if (pg.scheme == POOL_SCHEME_XOR)
             {
                 reconstruct_stripes_xor(stripes, pg.pg_size, clean_entry_bitmap_size);
             }
-            else if (op_data->scheme == POOL_SCHEME_EC)
+            else if (pg.scheme == POOL_SCHEME_EC)
             {
                 reconstruct_stripes_ec(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
             }
         }
     }
     // Send bitmap
-    cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
+    cur_op->reply.rw.bitmap_len = pg.pg_data_size * clean_entry_bitmap_size;
     cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
     // And finally compose the result
     uint64_t sent = 0;
@@ -67,7 +67,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
         if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
         {
             if (cur_op->op_data)
-                inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
+                inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg->pg_data_size * bs_block_size;
         }
         else
             inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;

@@ -76,7 +76,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
     {
         if (cur_op->op_data->pg_num > 0)
         {
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
+            auto & pg = *cur_op->op_data->pg;
             pg.inflight--;
             assert(pg.inflight >= 0);
             if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)

@@ -126,10 +126,10 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
     bool wr = submit_type == SUBMIT_WRITE;
     osd_primary_op_data_t *op_data = cur_op->op_data;
     osd_rmw_stripe_t *stripes = op_data->stripes;
-    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
+    bool rep = op_data->pg->scheme == POOL_SCHEME_REPLICATED;
     // Allocate subops
     int n_subops = 0, zero_read = -1;
-    for (int role = 0; role < op_data->pg_size; role++)
+    for (int role = 0; role < op_data->pg->pg_size; role++)
     {
         if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
             zero_read = role;

@@ -152,11 +152,11 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
     osd_rmw_stripe_t *stripes, const uint64_t* osd_set, osd_op_t *cur_op, int subop_idx, int zero_read)
 {
+    bool rep = cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED;
     bool wr = submit_type == SUBMIT_WRITE;
     osd_primary_op_data_t *op_data = cur_op->op_data;
-    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
     int i = subop_idx;
-    for (int role = 0; role < op_data->pg_size; role++)
+    for (int role = 0; role < op_data->pg->pg_size; role++)
     {
         // We always submit zero-length writes to all replicas, even if the stripe is not modified
         if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))

@@ -168,109 +168,9 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
         osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
         if (role_osd_num != 0)
         {
-            osd_op_t *subop = op_data->subops + i;
-            uint32_t subop_len = wr
-                ? si->write_end - si->write_start
-                : si->read_end - si->read_start;
-            if (!wr && si->read_end == UINT32_MAX)
-            {
-                subop_len = 0;
-            }
             si->osd_num = role_osd_num;
-            si->read_error = false;
-            subop->bitmap = si->bmp_buf;
-            subop->bitmap_len = clean_entry_bitmap_size;
-            // Using rmw_buf to pass pointer to stripes. Dirty but should work
-            subop->rmw_buf = si;
-            if (role_osd_num == this->osd_num)
-            {
-                clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
-                subop->op_type = (uint64_t)cur_op;
-                subop->bs_op = new blockstore_op_t((blockstore_op_t){
-                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
-                    .callback = [subop, this](blockstore_op_t *bs_subop)
-                    {
-                        handle_primary_bs_subop(subop);
-                    },
-                    { {
-                        .oid = (object_id){
-                            .inode = inode,
-                            .stripe = op_data->oid.stripe | stripe_num,
-                        },
-                        .version = op_version,
-                        .offset = wr ? si->write_start : si->read_start,
-                        .len = subop_len,
-                    } },
-                    .buf = wr ? si->write_buf : si->read_buf,
-                    .bitmap = si->bmp_buf,
-                });
-#ifdef OSD_DEBUG
-                printf(
-                    "Submit %s to local: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read",
-                    inode, op_data->oid.stripe | stripe_num, op_version,
-                    subop->bs_op->offset, subop->bs_op->len
-                );
-#endif
-                bs->enqueue_op(subop->bs_op);
-            }
-            else
-            {
-                subop->op_type = OSD_OP_OUT;
-                subop->req.sec_rw = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = msgr.next_subop_id++,
-                        .opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
-                    },
-                    .oid = {
-                        .inode = inode,
-                        .stripe = op_data->oid.stripe | stripe_num,
-                    },
-                    .version = op_version,
-                    .offset = wr ? si->write_start : si->read_start,
-                    .len = subop_len,
-                    .attr_len = wr ? clean_entry_bitmap_size : 0,
-                    .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
-                };
-#ifdef OSD_DEBUG
-                printf(
-                    "Submit %s to osd %ju: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read", role_osd_num,
-                    inode, op_data->oid.stripe | stripe_num, op_version,
-                    subop->req.sec_rw.offset, subop->req.sec_rw.len
-                );
-#endif
-                if (wr)
-                {
-                    if (si->write_end > si->write_start)
-                    {
-                        subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
-                    }
-                }
-                else
-                {
-                    if (subop_len > 0)
-                    {
-                        subop->iov.push_back(si->read_buf, subop_len);
-                    }
-                }
-                subop->callback = [cur_op, this](osd_op_t *subop)
-                {
-                    handle_primary_subop(subop, cur_op);
-                };
-                auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
-                if (peer_fd_it != msgr.osd_peer_fds.end())
-                {
-                    subop->peer_fd = peer_fd_it->second;
-                    msgr.outbox_push(subop);
-                }
-                else
-                {
-                    // Fail it immediately
-                    subop->peer_fd = -1;
-                    subop->reply.hdr.retval = -EPIPE;
-                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
-                }
-            }
+            si->role = stripe_num;
+            submit_primary_subop(cur_op, &op_data->subops[i], si, wr, inode, op_version);
             i++;
         }
         else

@@ -281,6 +181,112 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
     return i-subop_idx;
 }
 
+void osd_t::submit_primary_subop(osd_op_t *cur_op, osd_op_t *subop,
+    osd_rmw_stripe_t *si, bool wr, inode_t inode, uint64_t op_version)
+{
+    uint32_t subop_len = wr
+        ? si->write_end - si->write_start
+        : si->read_end - si->read_start;
+    if (!wr && si->read_end == UINT32_MAX)
+    {
+        subop_len = 0;
+    }
+    si->read_error = false;
+    subop->bitmap = si->bmp_buf;
+    subop->bitmap_len = clean_entry_bitmap_size;
+    // Using rmw_buf to pass pointer to stripes. Dirty but works
+    subop->rmw_buf = si;
+    if (si->osd_num == this->osd_num)
+    {
+        clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
+        subop->op_type = (uint64_t)cur_op; // also dirty
+        subop->bs_op = new blockstore_op_t((blockstore_op_t){
+            .opcode = (uint64_t)(wr ? (cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
+            .callback = [subop, this](blockstore_op_t *bs_subop)
+            {
+                handle_primary_bs_subop(subop);
+            },
+            { {
+                .oid = (object_id){
+                    .inode = inode,
+                    .stripe = cur_op->op_data->oid.stripe | si->role,
+                },
+                .version = op_version,
+                .offset = wr ? si->write_start : si->read_start,
+                .len = subop_len,
+            } },
+            .buf = wr ? si->write_buf : si->read_buf,
+            .bitmap = si->bmp_buf,
+        });
+#ifdef OSD_DEBUG
+        printf(
+            "Submit %s to local: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read",
+            inode, op_data->oid.stripe | si->role, op_version,
+            subop->bs_op->offset, subop->bs_op->len
+        );
+#endif
+        bs->enqueue_op(subop->bs_op);
+    }
+    else
+    {
+        subop->op_type = OSD_OP_OUT;
+        subop->req.sec_rw = (osd_op_sec_rw_t){
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = msgr.next_subop_id++,
+                .opcode = (uint64_t)(wr ? (cur_op->op_data->pg->scheme == POOL_SCHEME_REPLICATED ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
+            },
+            .oid = {
+                .inode = inode,
+                .stripe = cur_op->op_data->oid.stripe | si->role,
+            },
+            .version = op_version,
+            .offset = wr ? si->write_start : si->read_start,
+            .len = subop_len,
+            .attr_len = wr ? clean_entry_bitmap_size : 0,
+            .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
+        };
+#ifdef OSD_DEBUG
+        printf(
+            "Submit %s to osd %ju: %jx:%jx v%ju %u-%u\n", wr ? "write" : "read", si->osd_num,
+            inode, op_data->oid.stripe | si->role, op_version,
+            subop->req.sec_rw.offset, subop->req.sec_rw.len
+        );
+#endif
+        if (wr)
+        {
+            if (si->write_end > si->write_start)
+            {
+                subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
+            }
+        }
+        else
+        {
+            if (subop_len > 0)
+            {
+                subop->iov.push_back(si->read_buf, subop_len);
+            }
+        }
+        subop->callback = [cur_op, this](osd_op_t *subop)
+        {
+            handle_primary_subop(subop, cur_op);
+        };
+        auto peer_fd_it = msgr.osd_peer_fds.find(si->osd_num);
+        if (peer_fd_it != msgr.osd_peer_fds.end())
+        {
+            subop->peer_fd = peer_fd_it->second;
+            msgr.outbox_push(subop);
+        }
+        else
+        {
+            // Fail it immediately
+            subop->peer_fd = -1;
+            subop->reply.hdr.retval = -EPIPE;
+            ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
+        }
+    }
+}
+
 static uint64_t bs_op_to_osd_op[] = {
     0,
     OSD_OP_SEC_READ, // BS_OP_READ = 1

@@ -401,7 +407,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
     printf("subop %s %jx:%jx from osd %jd: version = %ju\n", osd_op_names[opcode],
         subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
 #endif
-    if (op_data->fact_ver != UINT64_MAX)
+    if (version != 0 && op_data->fact_ver != UINT64_MAX)
     {
         if (op_data->fact_ver != 0 && op_data->fact_ver != version)
         {

@@ -526,7 +532,7 @@ bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
 void osd_t::submit_primary_del_subops(osd_op_t *cur_op, osd_num_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set)
 {
     osd_primary_op_data_t *op_data = cur_op->op_data;
-    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
+    bool rep = op_data->pg->scheme == POOL_SCHEME_REPLICATED;
     obj_ver_osd_t extra_chunks[loc_set.size()];
     int chunks_to_del = 0;
     for (auto & chunk: loc_set)

@@ -738,10 +744,10 @@ void osd_t::submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd
 {
     osd_primary_op_data_t *op_data = cur_op->op_data;
     osd_rmw_stripe_t *stripes = op_data->stripes;
-    assert(op_data->scheme != POOL_SCHEME_REPLICATED);
+    assert(op_data->pg->scheme != POOL_SCHEME_REPLICATED);
     // Allocate subops
     int n_subops = 0;
-    for (int role = 0; role < op_data->pg_size; role++)
+    for (int role = 0; role < op_data->pg->pg_size; role++)
     {
         if (osd_set[role] != 0 && !stripes[role].read_error &&
             (osd_set[role] == this->osd_num || msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end()))

@@ -758,7 +764,7 @@ void osd_t::submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd
     op_data->subops = new osd_op_t[n_subops];
     op_data->unstable_writes = new obj_ver_id[n_subops];
     int i = 0;
-    for (int role = 0; role < op_data->pg_size; role++)
+    for (int role = 0; role < op_data->pg->pg_size; role++)
     {
         if (osd_set[role] != 0 && !stripes[role].read_error &&
             (osd_set[role] == this->osd_num || msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end()))
@@ -44,7 +44,7 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
         return;
     }
     osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    auto & pg = *cur_op->op_data->pg;
     if (op_data->st == 1) goto resume_1;
     else if (op_data->st == 2) goto resume_2;
     else if (op_data->st == 3) goto resume_3;

@@ -73,7 +73,7 @@ resume_1:
         op_data->object_state->ref_count++;
     }
 retry_1:
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
+    if (pg.scheme == POOL_SCHEME_REPLICATED)
     {
         // Simplified algorithm
         op_data->stripes[0].write_start = op_data->stripes[0].req_start;

@@ -99,7 +99,7 @@ retry_1:
     {
         assert(!cur_op->rmw_buf);
         cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
+            pg.pg_size, pg.pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
         if (!cur_op->rmw_buf)
         {
             // Refuse partial overwrite of an incomplete object

@@ -114,7 +114,7 @@ retry_1:
         // Allow to read version number (just version number!) from corrupted chunks
         // to allow full overwrite of a corrupted object
         bool found = false;
-        for (int role = 0; role < op_data->pg_size; role++)
+        for (int role = 0; role < pg.pg_size; role++)
         {
             if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
             {

@@ -124,8 +124,8 @@ retry_1:
         }
         if (!found)
         {
-            osd_num_t corrupted_target[op_data->pg_size];
-            for (int role = 0; role < op_data->pg_size; role++)
+            osd_num_t corrupted_target[pg.pg_size];
+            for (int role = 0; role < pg.pg_size; role++)
             {
                 corrupted_target[role] = 0;
             }

@@ -151,7 +151,7 @@ resume_3:
     if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
     {
         // Mark object corrupted and retry
-        op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
+        op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
         op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
         if (cur_op->rmw_buf)
         {

@@ -165,6 +165,7 @@ resume_3:
         return;
     }
     // Check CAS version
+    // FIXME: Handle CAS writes as "immediate" in non-immediate_commit pools, otherwise CAS doesn't make sense
     if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
     {
         deref_object_state(pg, &op_data->object_state, true);

@@ -172,7 +173,7 @@ resume_3:
         cur_op->reply.rw.version = op_data->fact_ver;
         goto continue_others;
     }
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
+    if (pg.scheme == POOL_SCHEME_REPLICATED)
     {
         // Set bitmap bits
         bitmap_set(op_data->stripes[0].bmp_buf, op_data->stripes[0].write_start,

@@ -203,7 +204,7 @@ resume_3:
         }
         else if (pg.scheme == POOL_SCHEME_EC)
         {
-            calc_rmw_parity_ec(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
+            calc_rmw_parity_ec(op_data->stripes, pg.pg_size, pg.pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
         }
     }
     // Send writes

@@ -269,7 +270,7 @@ resume_5:
     // and rollback successful part updates in case of EC.
     if (op_data->done > 0 && !op_data->drops)
     {
-        if (op_data->scheme != POOL_SCHEME_REPLICATED)
+        if (pg.scheme != POOL_SCHEME_REPLICATED)
         {
             submit_primary_rollback_subops(cur_op, pg.cur_set.data());
 resume_11:

@@ -293,7 +294,7 @@ resume_12:
         pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
         return;
     }
-    if (op_data->scheme != POOL_SCHEME_REPLICATED)
+    if (pg.scheme != POOL_SCHEME_REPLICATED)
     {
         // Remove version override just after the write, but before stabilizing
         pg.ver_override.erase(op_data->oid);

@@ -329,7 +330,7 @@ resume_7:
         memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type]));
         recovery_stat[recovery_type].count++;
     }
-    for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
+    for (int role = 0; role < (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
     {
         recovery_stat[recovery_type].bytes += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
     }

@@ -353,7 +354,7 @@ resume_7:
         for (auto & chunk: op_data->object_state->osd_set)
         {
             // Check is the same as in submit_primary_del_subops()
-            if (op_data->scheme == POOL_SCHEME_REPLICATED
+            if (pg.scheme == POOL_SCHEME_REPLICATED
                 ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
                 : (chunk.osd_num != pg.cur_set[chunk.role]))
             {

@@ -361,7 +362,7 @@ resume_7:
                     .osd_num = chunk.osd_num,
                     .oid = {
                         .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
+                        .stripe = op_data->oid.stripe | (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
                     },
                     .version = op_data->fact_ver,
                 });

@@ -472,7 +473,7 @@ bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t &
     if (immediate_commit == IMMEDIATE_ALL)
     {
 immediate:
-        if (op_data->scheme != POOL_SCHEME_REPLICATED)
+        if (pg.scheme != POOL_SCHEME_REPLICATED)
         {
             // Send STABILIZE ops immediately
             op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();

@@ -515,7 +516,7 @@ resume_7:
     }
     else if (immediate_commit == IMMEDIATE_SMALL)
     {
-        int stripe_count = (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg_size);
+        int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
         for (int role = 0; role < stripe_count; role++)
         {
             if (op_data->stripes[role].write_start == 0 &&

@@ -531,7 +532,7 @@ resume_7:
     {
 lazy:
         unstable_write_count++;
-        if (op_data->scheme != POOL_SCHEME_REPLICATED)
+        if (pg.scheme != POOL_SCHEME_REPLICATED)
        {
             // Remember version as unstable for EC/XOR
             for (auto & chunk: loc_set)
@@ -1118,139 +1118,230 @@ static bool next_combination(int *subset, int k, int n)
return true;
}

static int c_n_k(int n, int k)
static uint64_t c_n_k(uint64_t n, uint64_t k)
{
int c = 1;
for (int i = n; i > k; i--)
uint64_t c = 1;
for (uint64_t i = n; i > k; i--)
{
if ((c*i) < i)
return UINT64_MAX;
c *= i;
for (int i = 2; i <= (n-k); i++)
}
for (uint64_t i = 2; i <= (n-k); i++)
c /= i;
return c;
}

std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
static std::vector<int> ec_check_combination(osd_rmw_stripe_t *stripes, int stripe_count,
int *subset, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, uint8_t *tmp_buf)
{
osd_num_t fake_osd_set[pg_size];
for (int i = 0; i < pg_size; i++)
{
fake_osd_set[i] = i+1;
}
osd_rmw_stripe_t brute_stripes[pg_size];
memset(brute_stripes, 0, sizeof(osd_rmw_stripe_t)*pg_size);
for (int i = 0; i < pg_size; i++)
{
auto & bs = brute_stripes[i];
bs.req_end = bs.read_end = chunk_size;
}
for (int i = 0; i < pg_minsize; i++)
{
auto & src = stripes[subset[i]];
auto & bs = brute_stripes[src.role];
bs.bmp_buf = src.bmp_buf;
bs.write_buf = bs.read_buf = src.read_buf;
}
for (int i = 0; i < pg_size; i++)
{
auto & bs = brute_stripes[i];
if (!bs.read_buf)
{
// missing chunks are recovered in read_bufs and write_bufs are used as source for parity
bs.missing = true;
bs.read_buf = bs.write_buf = tmp_buf+i*chunk_size;
bs.bmp_buf = tmp_buf + stripe_count*chunk_size + i*bitmap_size;
}
else if (i >= pg_minsize)
{
// parity chunks are regenerated in their write_bufs, so use a temporary buffer
bs.write_buf = tmp_buf+i*chunk_size;
}
}
if (is_xor)
{
assert(pg_size == pg_minsize+1);
reconstruct_stripes_xor(brute_stripes, pg_size, bitmap_size);
}
else
{
reconstruct_stripes_ec(brute_stripes, pg_size, pg_minsize, bitmap_size);
calc_rmw_parity_ec(brute_stripes, pg_size, pg_minsize, fake_osd_set, fake_osd_set, chunk_size, bitmap_size);
}
bool matched_other = false;
std::vector<int> good_set;
for (int i = 0; i < stripe_count; i++)
{
if (stripes[i].read_error || stripes[i].not_exists)
{
continue;
}
auto & bs = brute_stripes[stripes[i].role];
if (!bs.missing && bs.read_buf == stripes[i].read_buf)
{
// source chunk, mark OK
good_set.push_back(i);
}
else if (memcmp(stripes[i].role < pg_minsize ? bs.read_buf : bs.write_buf, stripes[i].read_buf, chunk_size) == 0)
{
// matching chunk, mark OK
good_set.push_back(i);
matched_other = true;
}
}
if (!matched_other)
{
good_set.clear();
}
return good_set;
}

static int count_roles(osd_rmw_stripe_t *stripes, std::vector<int> & valid_chunks, int pg_size)
{
bool role_ok[pg_size];
for (int i = 0; i < pg_size; i++)
{
role_ok[i] = false;
}
for (int idx: valid_chunks)
{
role_ok[stripes[idx].role] = true;
}
int ok_count = 0;
for (int i = 0; i < pg_size; i++)
{
if (role_ok[i])
ok_count++;
}
return ok_count;
}

std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int stripe_count, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, uint64_t max_bruteforce, bool find_best)
{
std::vector<int> found_valid;
int cur_live[pg_size], live_count = 0, exists_count = 0;
osd_num_t fake_osd_set[pg_size];
for (int role = 0; role < pg_size; role++)
std::vector<std::vector<int>> live_variants(pg_size);
int eq_to[stripe_count];
int live_roles = 0, live_total = 0;
for (int i = 0; i < pg_size; i++)
{
if (!stripes[role].missing)
{
if (!stripes[role].not_exists)
exists_count++;
cur_live[live_count++] = role;
fake_osd_set[role] = role+1;
}
eq_to[i] = i;
}
if (live_count <= pg_minsize)
for (int i = 0; i < stripe_count; i++)
{
return std::vector<int>();
}
if (exists_count <= pg_minsize)
{
// Special case: user manually deleted some chunks
for (int role = 0; role < pg_size; role++)
if (!stripes[role].missing && !stripes[role].not_exists)
found_valid.push_back(role);
return found_valid;
}
// Try to locate errors using brute force if there isn't too many combinations
osd_rmw_stripe_t brute_stripes[pg_size];
int out_count = live_count-pg_minsize;
bool brute_force = out_count > 1 && c_n_k(live_count-1, out_count-1) <= max_bruteforce;
int subset[pg_minsize], outset[out_count];
// Select all combinations with items except the last one (== anything to compare)
first_combination(subset, pg_minsize, live_count-1);
uint8_t *tmp_buf = (uint8_t*)malloc_or_die(pg_size*chunk_size);
do
{
memcpy(brute_stripes, stripes, sizeof(osd_rmw_stripe_t)*pg_size);
int i = 0, j = 0, k = 0;
for (; i < pg_minsize; i++, j++)
while (j < subset[i])
outset[k++] = j++;
while (j < pg_size)
outset[k++] = j++;
for (int i = 0; i < out_count; i++)
if (!stripes[i].read_error && !stripes[i].not_exists)
{
brute_stripes[cur_live[outset[i]]].missing = true;
brute_stripes[cur_live[outset[i]]].read_buf = tmp_buf+cur_live[outset[i]]*chunk_size;
}
for (int i = 0; i < pg_minsize; i++)
{
brute_stripes[i].write_buf = brute_stripes[i].read_buf;
brute_stripes[i].req_start = 0;
brute_stripes[i].req_end = chunk_size;
}
for (int i = pg_minsize; i < pg_size; i++)
{
brute_stripes[i].write_buf = tmp_buf+i*chunk_size;
}
if (is_xor)
{
assert(pg_size == pg_minsize+1);
reconstruct_stripes_xor(brute_stripes, pg_size, bitmap_size);
}
else
{
reconstruct_stripes_ec(brute_stripes, pg_size, pg_minsize, bitmap_size);
calc_rmw_parity_ec(brute_stripes, pg_size, pg_minsize, fake_osd_set, fake_osd_set, chunk_size, bitmap_size);
}
for (int i = pg_minsize; i < pg_size; i++)
{
brute_stripes[i].read_buf = brute_stripes[i].write_buf;
}
int valid_count = 0;
for (int i = 0; i < out_count; i++)
{
if (memcmp(brute_stripes[cur_live[outset[i]]].read_buf,
stripes[cur_live[outset[i]]].read_buf, chunk_size) == 0)
if (live_variants[stripes[i].role].size() > 0)
{
brute_stripes[cur_live[outset[i]]].missing = false;
valid_count++;
}
}
if (valid_count > 0)
{
if (found_valid.size())
{
// Check if we found the same set from the different point of view,
// like 1 2 3 -> valid 4 5 and 1 3 4 -> valid 2 5
for (int i = 0, j = 0; i < pg_size; i++)
for (int j = 0; j < i; j++)
{
if (!brute_stripes[i].missing)
if (stripes[j].role == stripes[i].role &&
memcmp(stripes[i].read_buf, stripes[j].read_buf, chunk_size) == 0)
{
if (j >= found_valid.size() || found_valid[j] != i)
{
// Ambiguity: we found multiple valid sets and don't know which one is correct
found_valid.clear();
break;
}
j++;
eq_to[i] = eq_to[j];
break;
}
}
if (!found_valid.size())
{
break;
}
}
else
{
for (int i = 0; i < pg_size; i++)
live_roles++;
}
if (eq_to[i] == i)
{
live_variants[stripes[i].role].push_back(i);
live_total++;
}
}
}
if (live_roles == pg_minsize && live_total > pg_minsize)
{
// Nothing to validate and there are chunks with different data => object is inconsistent
return std::vector<int>();
}
if (live_roles <= pg_minsize)
{
// Nothing to validate, just return all live chunks
for (int i = 0; i < stripe_count; i++)
if (!stripes[i].read_error)
found_valid.push_back(i);
return found_valid;
}
// Try to locate errors using brute force if there isn't too many combinations
bool brute_force = c_n_k(live_roles, pg_minsize) <= max_bruteforce;
int combination[pg_minsize], subset[pg_minsize], subvar[pg_minsize];
// To translate 0..live_roles into 0..pg_size
int comb_to_subset[live_roles];
for (int i = 0, r = 0; i < pg_size; i++)
{
if (live_variants[i].size() > 0)
comb_to_subset[r++] = i;
}
// Select all combinations with items except the last one (== anything to compare)
first_combination(combination, pg_minsize, live_roles);
uint8_t *tmp_buf = (uint8_t*)malloc_or_die(stripe_count*(chunk_size+bitmap_size));
do
{
// Then loop over all subvariants (if some roles have multiple diverged variants of data)
for (int i = 0; i < pg_minsize; i++)
{
subvar[i] = 0;
}
while (true)
{
// Transform combination[] + subvar[] into subset[]
for (int i = 0; i < pg_minsize; i++)
{
subset[i] = live_variants[comb_to_subset[combination[i]]][subvar[i]];
}
// Check the combination
auto valid_chunks = ec_check_combination(stripes, stripe_count, subset, pg_size, pg_minsize, is_xor, chunk_size, bitmap_size, tmp_buf);
// The same set may be found from different points of view,
// like 1 2 3 -> valid 4 5 and 1 3 4 -> valid 2 5
if (valid_chunks.size() > 0)
{
if (found_valid.size() >= valid_chunks.size() && found_valid != valid_chunks)
{
if (!brute_stripes[i].missing)
{
found_valid.push_back(i);
}
// Ambiguity: we found multiple valid sets and don't know which one is correct
printf("Scrub found 2 different correct chunk subsets: OSD ");
for (int i = 0; i < found_valid.size(); i++)
printf(i > 0 ? ", %ju" : "%ju", stripes[found_valid[i]].osd_num);
printf(" and OSD ");
for (int i = 0; i < valid_chunks.size(); i++)
printf(i > 0 ? ", %ju" : "%ju", stripes[valid_chunks[i]].osd_num);
printf("\n");
found_valid.clear();
goto out;
}
else if (!found_valid.size() && (find_best || count_roles(stripes, valid_chunks, pg_size) >= pg_size))
{
found_valid = valid_chunks;
}
}
if (valid_count == out_count)
// Select next subvariant
int i = 0;
for (i = 0; i < pg_minsize; i++)
{
// All chunks are good
break;
subvar[i]++;
if (subvar[i] < live_variants[combination[i]].size())
break;
subvar[i] = 0;
}
if (i >= pg_minsize)
break;
}
if (!brute_force)
{

@@ -1258,7 +1349,8 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
// if we find it we won't be able to check that it's the only good one
break;
}
} while (out_count > 1 && next_combination(subset, pg_minsize, live_count-1));
} while (next_combination(combination, pg_minsize, live_roles));
out:
free(tmp_buf);
return found_valid;
}
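The rewritten `ec_find_good()` above first collapses identical chunks into per-role variants, then brute-forces `pg_minsize`-of-`live_roles` subsets of the surviving chunks, and refuses to brute-force when `c_n_k()` reports that the number of combinations exceeds `max_bruteforce` (or overflows). As a rough, stand-alone illustration of that counting-and-enumeration idea only — not the Vitastor code; `choose`, `first_combination` and `next_combination` below are hypothetical re-implementations written for this sketch:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Overflow-guarded n-choose-k: returns UINT64_MAX when the product would overflow,
// so an overflowing count compares as "too many combinations" at the cutoff check.
static uint64_t choose(uint64_t n, uint64_t k)
{
    if (k > n)
        return 0;
    uint64_t c = 1;
    for (uint64_t i = 1; i <= k; i++)
    {
        uint64_t num = n - k + i;
        if (c > UINT64_MAX / num)
            return UINT64_MAX;
        c = c * num / i; // exact at every step: the intermediate value is itself a binomial
    }
    return c;
}

// Enumerate all k-element subsets of {0..n-1} in lexicographic order.
static void first_combination(std::vector<int> & subset, int k)
{
    subset.resize(k);
    for (int i = 0; i < k; i++)
        subset[i] = i;
}

static bool next_combination(std::vector<int> & subset, int k, int n)
{
    int i = k - 1;
    while (i >= 0 && subset[i] == n - k + i)
        i--;
    if (i < 0)
        return false; // last combination reached
    subset[i]++;
    for (int j = i + 1; j < k; j++)
        subset[j] = subset[j-1] + 1;
    return true;
}

int main()
{
    const int n = 5, k = 3;
    if (choose(n, k) > 100) // analogous to the max_bruteforce cutoff
        return 0;
    std::vector<int> subset;
    first_combination(subset, k);
    do
    {
        for (int x: subset)
            printf("%d ", x);
        printf("\n");
    } while (next_combination(subset, k, n));
    return 0;
}
```

The overflow guard matters because with wide EC profiles the binomial coefficient can easily exceed 64 bits; saturating to UINT64_MAX keeps the "skip brute force" comparison behaving the same way the patch's `c_n_k()` intends.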
@@ -28,6 +28,7 @@ struct osd_rmw_stripe_t
uint32_t read_start, read_end;
uint32_t write_start, write_end;
osd_num_t osd_num;
int role;
bool missing: 1;
bool read_error: 1;
bool not_exists: 1;

@@ -57,5 +58,5 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);

std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce);
std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int stripe_count, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, uint64_t max_bruteforce, bool find_best);
@@ -1160,24 +1160,26 @@ void test_ec43_error_bruteforce()
stripes[i].read_end = 4096;
stripes[i].read_buf = write_buf+i*4096;
stripes[i].write_buf = NULL;
stripes[i].role = i;
stripes[i].osd_num = i+1;
}
// All good chunks
auto res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
auto res = ec_find_good(stripes, 7, 7, 4, false, 4096, 0, 100, true);
assert_eq_vec(res, std::vector<int>({0, 1, 2, 3, 4, 5, 6}));
// 1 missing chunk
set_pattern(write_buf+1*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
res = ec_find_good(stripes, 7, 7, 4, false, 4096, 0, 100, true);
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 5, 6}));
// 2 missing chunks
set_pattern(write_buf+1*4096, 4096, 0);
set_pattern(write_buf+5*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
res = ec_find_good(stripes, 7, 7, 4, false, 4096, 0, 100, true);
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 6}));
// 3 missing chunks
set_pattern(write_buf+1*4096, 4096, 0);
set_pattern(write_buf+5*4096, 4096, 0);
set_pattern(write_buf+6*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
res = ec_find_good(stripes, 7, 7, 4, false, 4096, 0, 100, true);
assert_eq_vec(res, std::vector<int>());
// Done
free(rmw_buf);
@@ -369,114 +369,102 @@ void osd_t::schedule_scrub(pg_t & pg)
}
}

void osd_t::continue_primary_scrub(osd_op_t *cur_op)
void osd_t::submit_scrub_subops(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
return;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1)
goto resume_1;
else if (op_data->st == 2)
goto resume_2;
assert(!op_data->stripe_count);
cur_op->req.rw.len = bs_block_size * op_data->pg->pg_data_size;
// Determine version
auto vo_it = op_data->pg->ver_override.find(op_data->oid);
op_data->target_ver = vo_it != op_data->pg->ver_override.end() ? vo_it->second : UINT64_MAX;
// Find object state
op_data->prev_set = get_object_osd_set(*op_data->pg, op_data->oid, &op_data->object_state);
if (!op_data->object_state)
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
cur_op->req.rw.len = bs_block_size * pg.pg_data_size;
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
// PG may have degraded or misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
// Read all available chunks
int n_copies = 0;
op_data->degraded = false;
for (int role = 0; role < op_data->pg_size; role++)
op_data->stripe_count = op_data->pg->pg_size;
op_data->stripes = (osd_rmw_stripe_t*)calloc_or_die(op_data->stripe_count, sizeof(osd_rmw_stripe_t));
for (int i = 0; i < op_data->pg->pg_size; i++)
{
op_data->stripes[role].write_buf = NULL;
op_data->stripes[role].read_start = 0;
op_data->stripes[role].read_end = bs_block_size;
if (op_data->prev_set[role] != 0)
{
n_copies++;
}
else
{
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, even if we'd like to
finish_op(cur_op, 0);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
// Submit reads
osd_op_t *subops = new osd_op_t[n_copies];
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_copies;
op_data->subops = subops;
int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
op_data->stripes, op_data->prev_set, cur_op, 0, -1);
assert(sent == n_copies);
op_data->st = 1;
}
resume_1:
return;
resume_2:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// I/O or checksum error
int n_copies = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].read_error)
{
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
else if (!op_data->stripes[role].missing)
{
n_copies++;
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, just mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
// Operation is treated as unsuccessful only if the object becomes unreadable
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
return;
}
// Proceed, we can still compare chunks that were successfully read
}
else
{
finish_op(cur_op, op_data->errcode);
return;
op_data->stripes[i].osd_num = op_data->prev_set[i];
op_data->stripes[i].role = (op_data->pg->scheme == POOL_SCHEME_REPLICATED ? 0 : i);
op_data->stripes[i].read_end = bs_block_size;
}
}
else
{
op_data->stripe_count = 0;
for (auto & chunk: op_data->object_state->osd_set)
{
// Read all chunks except outdated
if (!(chunk.loc_bad & LOC_OUTDATED))
op_data->stripe_count++;
}
op_data->stripes = (osd_rmw_stripe_t*)calloc_or_die(op_data->stripe_count, sizeof(osd_rmw_stripe_t));
int i = 0;
for (auto & chunk: op_data->object_state->osd_set)
{
if (!(chunk.loc_bad & LOC_OUTDATED))
{
op_data->stripes[i].osd_num = chunk.osd_num;
op_data->stripes[i].role = chunk.role;
op_data->stripes[i].read_end = bs_block_size;
i++;
}
}
}
assert(!cur_op->bitmap_buf);
cur_op->bitmap_buf = calloc_or_die(1, clean_entry_bitmap_size * op_data->stripe_count);
for (int i = 0; i < op_data->stripe_count; i++)
{
op_data->stripes[i].bmp_buf = (uint8_t*)cur_op->bitmap_buf + clean_entry_bitmap_size * i;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->stripe_count, 0);
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = op_data->stripe_count;
op_data->subops = new osd_op_t[op_data->stripe_count];
op_data->st = 1;
for (int i = 0; i < op_data->stripe_count; i++)
{
submit_primary_subop(cur_op, &op_data->subops[i], &op_data->stripes[i],
false, op_data->oid.inode, op_data->target_ver);
}
}

// The idea is that scrub should not only find out if the object
// is corrupted, but it should also verify availability of all copies
void osd_t::scrub_check_results(osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
bool inconsistent = false;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
int total = 0;
for (int role = 0; role < op_data->stripe_count; role++)
{
if (!op_data->stripes[role].not_exists)
total++;
}
if (!total)
{
// Object is deleted manually from all OSDs, forget it
printf(
"[PG %u/%u] Scrub detected a deleted object %jx:%jx\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe
);
remove_object_from_state(op_data->oid, &op_data->object_state, *op_data->pg, false);
deref_object_state(*op_data->pg, &op_data->object_state, true);
return;
}
if (op_data->pg->scheme == POOL_SCHEME_REPLICATED)
{
// Check that all chunks have returned the same data
int total = 0;
int eq_to[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
int eq_to[op_data->stripe_count];
for (int role = 0; role < op_data->stripe_count; role++)
{
eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
if (op_data->stripes[role].read_end != 0 &&
!op_data->stripes[role].read_error &&
!op_data->stripes[role].not_exists)
{
total++;

@@ -492,16 +480,16 @@ resume_2:
}
}
}
int votes[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
int votes[op_data->stripe_count];
for (int role = 0; role < op_data->stripe_count; role++)
votes[role] = 0;
for (int role = 0; role < op_data->pg_size; role++)
for (int role = 0; role < op_data->stripe_count; role++)
{
if (eq_to[role] != -1)
votes[eq_to[role]]++;
}
int best = -1;
for (int role = 0; role < op_data->pg_size; role++)
for (int role = 0; role < op_data->stripe_count; role++)
{
if (votes[role] > (best >= 0 ? votes[best] : 0))
best = role;

@@ -509,7 +497,7 @@ resume_2:
if (best >= 0 && votes[best] < total)
{
bool unknown = false;
for (int role = 0; role < op_data->pg_size; role++)
for (int role = 0; role < op_data->stripe_count; role++)
{
if (role != best && votes[role] == votes[best])
{

@@ -550,10 +538,11 @@ resume_2:
}
else
{
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
assert(op_data->pg->scheme == POOL_SCHEME_EC || op_data->pg->scheme == POOL_SCHEME_XOR);
auto good_subset = ec_find_good(
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
op_data->stripes, op_data->stripe_count,
op_data->pg->pg_size, op_data->pg->pg_data_size, op_data->pg->scheme == POOL_SCHEME_XOR,
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce, scrub_find_best
);
if (!good_subset.size())
{

@@ -567,62 +556,115 @@ resume_2:
else
{
int total = 0;
for (int role = 0; role < op_data->pg_size; role++)
for (int i = 0; i < op_data->stripe_count; i++)
{
if (!op_data->stripes[role].missing)
if (!op_data->stripes[i].not_exists)
{
// use "missing" flag to distinguish actual read errors and inconsistent chunks
total++;
op_data->stripes[role].read_error = true;
op_data->stripes[i].missing = true;
}
}
for (int role: good_subset)
for (int i: good_subset)
{
op_data->stripes[role].read_error = false;
op_data->stripes[i].missing = false;
}
for (int role = 0; role < op_data->pg_size; role++)
for (int i = 0; i < op_data->stripe_count; i++)
{
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
if (op_data->stripes[i].missing)
{
op_data->stripes[i].read_error = true;
printf(
"[PG %u/%u] Object %jx:%jx v%ju chunk %d on OSD %ju doesn't match other chunks%s\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
role, op_data->stripes[role].osd_num,
op_data->stripes[i].role, op_data->stripes[i].osd_num,
scrub_find_best ? ", marking it as corrupted" : ""
);
}
}
if (!scrub_find_best && good_subset.size() < total)
}
}
bool mark = inconsistent;
for (int role = 0; !mark && role < op_data->stripe_count; role++)
{
if (op_data->stripes[role].read_error || op_data->stripes[role].not_exists)
mark = true;
}
if (!mark)
{
return;
}
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object(*op_data->pg, op_data->oid, op_data->object_state, false /*ref*/, [op_data, inconsistent](pg_osd_set_t & new_set)
{
// Mark object chunk(s) as corrupted and/or missing and/or inconsistent
int changes = 0;
for (int i = 0; i < op_data->stripe_count; i++)
{
// Find the same stripe in new_set
int set_pos = 0;
while (set_pos < new_set.size() && (op_data->stripes[i].osd_num != new_set[set_pos].osd_num ||
op_data->stripes[i].role != new_set[set_pos].role))
{
inconsistent = true;
printf(
"[PG %u/%u] Object %jx:%jx v%ju is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
for (int role = 0; role < op_data->pg_size; role++)
{
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
{
// Undo error locator marking chunk as bad
op_data->stripes[role].read_error = false;
}
}
set_pos++;
}
if (set_pos >= new_set.size())
{
continue;
}
if (op_data->stripes[i].not_exists)
{
changes++;
new_set.erase(new_set.begin()+set_pos, new_set.begin()+set_pos+1);
continue;
}
auto & chunk = new_set[set_pos];
if (op_data->stripes[i].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (op_data->stripes[i].read_end > 0 && !op_data->stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
if (inconsistent && !(chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
}
}
for (int role = 0; role < op_data->pg_size; role++)
return changes;
});
}

void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
return;
if (cur_op->op_data->st == 1)
goto resume_1;
else if (cur_op->op_data->st == 2)
goto resume_2;
submit_scrub_subops(cur_op);
resume_1:
return;
resume_2:
if (cur_op->op_data->errors > 0 &&
// I/O and checksum errors (represented by stripes[i].read_error) are OK
(cur_op->op_data->errcode != -EIO && cur_op->op_data->errcode != -EDOM))
{
if (op_data->stripes[role].osd_num != 0 &&
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
inconsistent)
{
// Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
break;
}
finish_op(cur_op, cur_op->op_data->errcode);
return;
}
scrub_check_results(cur_op);
finish_op(cur_op, 0);
}
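For replicated pools, `scrub_check_results()` groups the copies it has read by content (`eq_to`), counts `votes` per distinct content and only trusts a strict majority; a tie means it cannot tell which copy is correct. A self-contained sketch of just that voting step, assuming the copies are already in memory (the names here are illustrative, not the OSD's):

```cpp
#include <cstdio>
#include <cstring>
#include <vector>

// For each copy, find the first earlier copy with identical bytes (eq_to),
// then count votes per representative and pick a strict winner.
// Returns -1 when there is no unique majority (the object looks inconsistent).
static int pick_majority_copy(const std::vector<std::vector<unsigned char>> & copies)
{
    int n = (int)copies.size();
    std::vector<int> eq_to(n), votes(n, 0);
    for (int i = 0; i < n; i++)
    {
        eq_to[i] = i;
        for (int j = 0; j < i; j++)
        {
            if (copies[j].size() == copies[i].size() &&
                memcmp(copies[j].data(), copies[i].data(), copies[i].size()) == 0)
            {
                eq_to[i] = eq_to[j];
                break;
            }
        }
        votes[eq_to[i]]++;
    }
    int best = -1;
    for (int i = 0; i < n; i++)
        if (best < 0 || votes[i] > votes[best])
            best = i;
    // A tie between different contents means no copy can be declared correct
    for (int i = 0; i < n; i++)
        if (i != best && votes[i] == votes[best])
            return -1;
    return best;
}

int main()
{
    std::vector<std::vector<unsigned char>> copies = {
        { 1, 2, 3 }, { 1, 2, 3 }, { 9, 9, 9 },
    };
    printf("majority copy: %d\n", pick_majority_copy(copies));
    return 0;
}
```

In the patch the same comparison runs over the scrub read buffers in `op_data->stripes[...]`, and an unresolved tie leads to the object being treated as inconsistent rather than silently picking one copy.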