Implement fool protection for FS pools

master
Vitaliy Filippov 2024-03-10 18:08:57 +03:00
parent dcbe1afac3
commit 6783d4a13c
15 changed files with 116 additions and 26 deletions

View File

@ -41,6 +41,7 @@ Parameters:
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
Examples:
@ -299,6 +300,25 @@ of the OSDs containing a data chunk for a PG.
Automatic scrubbing interval for this pool. Overrides
[global scrub_interval setting](osd.en.md#scrub_interval).
## used_for_fs
- Type: string
If non-empty, the pool is marked as used for VitastorFS, with FS metadata stored
in a block image (a regular Vitastor volume) whose name equals the value of this parameter.
When a pool is marked as used for VitastorFS, regular block volume creation in it
is disabled (vitastor-cli refuses to create images without --force) to protect
the user from ID collisions between block volumes and FS files, and thus from data loss.
[vitastor-nfs](../usage/nfs.en.md), in turn, refuses on startup to use pools that
are not marked for the corresponding FS. This also implies that one pool can only
be used for one VitastorFS.
The other thing that is disabled for VitastorFS pools is reporting per-inode space
usage statistics in etcd, because an FS pool may store a very large number of files,
and statistics for all of them would take up a lot of space in etcd.
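For example, the intended setup flow (a sketch assuming a pool named `testpool` and a
metadata image named `fsmeta`, matching the names used in the test suite) looks like this:

```
# Create the FS metadata image in the pool, then mark the pool as used for the FS
vitastor-cli create -s 10G fsmeta
vitastor-cli modify-pool --used_for_fs fsmeta testpool
# vitastor-nfs now agrees to serve this FS from this pool
vitastor-nfs start --fs fsmeta
```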
# Examples
## Replicated pool

View File

@ -40,6 +40,7 @@
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
Examples:
@ -306,6 +307,27 @@ OSD with "all".
Scrub interval, i.e. the interval of automatic background data verification for this pool.
Overrides the [global scrub_interval setting](osd.ru.md#scrub_interval).
## used_for_fs
- Type: string
If non-empty, the pool is marked as used for the VitastorFS file system, with
metadata stored in a Vitastor block image whose name equals the value of this
parameter.
When a pool is marked as used for VitastorFS, creation of regular block images
in it is disabled (vitastor-cli refuses to create images without --force)
to protect the user from ID collisions between files and block images and,
thus, from data loss.
[vitastor-nfs](../usage/nfs.ru.md), in turn, refuses on startup to use pools
that are not assigned to the given FS. This also means that one pool can only
be used for one VitastorFS.
Per-inode statistics reporting to etcd is also disabled for FS pools, because
an FS pool may contain a very large number of files, and statistics for all
of them would take up a lot of space in etcd.
# Examples
## Replicated pool

View File

@ -267,7 +267,7 @@ Optional parameters:
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
| `--no_inode_stats 1` | Disable per-inode statistics for this pool (use for VitastorFS pools) |
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
| `--pg_stripe_size <number>` | Increase object grouping stripe |
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
| `--wait` | Wait for the new pool to come online |
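For instance, a pool intended to be used for VitastorFS could be created in one step
(a sketch; the pool name, PG size/count and metadata image name `fsmeta` are assumptions):

```
vitastor-cli create-pool testpool -s 2 -n 32 --used_for_fs fsmeta --wait
```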

View File

@ -131,7 +131,7 @@ static const char* help_text =
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
" --no_inode_stats 1 Disable per-inode statistics for this pool (use for VitastorFS pools)\n"
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
" --pg_stripe_size <number> Increase object grouping stripe\n"
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
" --wait Wait for the new pool to come online\n"
@ -143,7 +143,7 @@ static const char* help_text =
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
" Modify an existing pool. Modifiable parameters:\n"
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--no_inode_stats 0|1]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
" [--block_size <size>] [--bitmap_granularity <size>]\n"
@ -186,7 +186,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
for (int i = 1; i < narg; i++)
{
bool argHasValue = (!(i == narg-1) && (args[i+1][0] != '-'));
if (args[i][0] == '-' && args[i][1] == 'h' && args[i][2] == 0)
{
cfg["help"] = "1";

View File

@ -27,6 +27,7 @@ struct image_creator_t
std::string image_name, new_snap, new_parent;
json11::Json new_meta;
uint64_t size;
bool force = false;
bool force_size = false;
pool_id_t old_pool_id = 0;
@ -45,6 +46,7 @@ struct image_creator_t
void loop()
{
auto & pools = parent->cli->st_cli.pool_config;
if (state >= 1)
goto resume_1;
if (image_name == "")
@ -62,7 +64,6 @@ struct image_creator_t
}
if (new_pool_id)
{
auto & pools = parent->cli->st_cli.pool_config;
if (pools.find(new_pool_id) == pools.end())
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
@ -72,7 +73,7 @@ struct image_creator_t
}
else if (new_pool_name != "")
{
for (auto & ic: parent->cli->st_cli.pool_config)
for (auto & ic: pools)
{
if (ic.second.name == new_pool_name)
{
@ -87,10 +88,20 @@ struct image_creator_t
return;
}
}
else if (parent->cli->st_cli.pool_config.size() == 1)
else if (pools.size() == 1)
{
auto it = parent->cli->st_cli.pool_config.begin();
new_pool_id = it->first;
new_pool_id = pools.begin()->first;
}
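// Fool protection: refuse to create a regular block image in a VitastorFS pool
// unless --force is given, to avoid ID collisions with FS files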
if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
{
result = (cli_result_t){
.err = EINVAL,
.text = "Pool "+pools.at(new_pool_id).name+
" is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
". Use --force if you really know what you are doing",
};
state = 100;
return;
}
state = 1;
resume_1:
@ -532,6 +543,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
image_creator->image_name = cfg["image"].string_value();
image_creator->new_pool_id = cfg["pool"].uint64_value();
image_creator->new_pool_name = cfg["pool"].string_value();
image_creator->force = cfg["force"].bool_value();
image_creator->force_size = cfg["force_size"].bool_value();
if (cfg["image_meta"].is_object())
{

View File

@ -81,13 +81,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
}
value = value.uint64_value();
}
else if (key == "no_inode_stats" && value.bool_value())
{
// Leave true, remove false
value = true;
}
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
key == "failure_domain" || key == "root_node" || key == "scrub_interval")
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs")
{
// OK
}
@ -124,6 +119,10 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
{
new_cfg.erase("parity_chunks");
}
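// An empty used_for_fs means "not used for VitastorFS", so drop the key instead of storing ""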
if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
{
new_cfg.erase("used_for_fs");
}
// Prevent autovivification of object keys. Now we don't modify the config, we just check it
json11::Json cfg = new_cfg;

View File

@ -529,8 +529,6 @@ resume_3:
st["block_size_fmt"] = format_size(st["block_size"].uint64_value());
if (st["bitmap_granularity"].uint64_value())
st["bitmap_granularity_fmt"] = format_size(st["bitmap_granularity"].uint64_value());
if (st["no_inode_stats"].bool_value())
st["inode_stats_fmt"] = "disabled";
}
// All pool parameters are only displayed in the "detailed" mode
// because there's too many of them to show them in table
@ -538,6 +536,7 @@ resume_3:
{ "name", "Name" },
{ "id", "ID" },
{ "scheme_name", "Scheme" },
{ "used_for_fs", "Used for VitastorFS" },
{ "status", "Status" },
{ "pg_count_fmt", "PGs" },
{ "pg_minsize", "PG minsize" },

View File

@ -112,6 +112,24 @@ resume_1:
return;
}
if (new_cfg.find("used_for_fs") != new_cfg.end() && !force)
{
// Check that pool doesn't have images
auto img_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(pool_id, 0));
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id &&
img_it->second.name == new_cfg["used_for_fs"].string_value())
{
// Only allow metadata image to exist in the FS pool
img_it++;
}
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id)
{
result = (cli_result_t){ .err = ENOTEMPTY, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS" };
state = 100;
return;
}
}
// Update pool
auto pls = kv.value.object_items();
pls[std::to_string(pool_id)] = new_cfg;

View File

@ -863,8 +863,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
if (!pc.scrub_interval)
pc.scrub_interval = 0;
// Disable per-inode stats
pc.no_inode_stats = pool_item.second["no_inode_stats"].bool_value();
// Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
// Immediate Commit Mode
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value())

View File

@ -60,7 +60,7 @@ struct pool_config_t
uint64_t pg_stripe_size;
std::map<pg_num_t, pg_config_t> pg_config;
uint64_t scrub_interval;
bool no_inode_stats;
std::string used_for_fs;
};
struct inode_config_t

View File

@ -196,6 +196,7 @@ void nfs_kv_procs(nfs_client_t *self)
void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
{
this->proxy = proxy;
auto & pool_cfg = proxy->cli->st_cli.pool_config.at(proxy->default_pool_id);
fs_kv_inode = cfg["fs"].uint64_value();
if (fs_kv_inode)
{
@ -221,6 +222,25 @@ void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
exit(1);
}
}
if (proxy->cli->st_cli.inode_config.find(fs_kv_inode) != proxy->cli->st_cli.inode_config.end())
{
auto & name = proxy->cli->st_cli.inode_config.at(fs_kv_inode).name;
if (pool_cfg.used_for_fs != name)
{
fprintf(stderr, "Please mark pool as used for this file system with `vitastor-cli modify-pool --used-for-fs %s %s`\n",
name.c_str(), pool_cfg.name.c_str());
exit(1);
}
}
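// Reserve inode IDs up to the highest existing block image ID in the default pool,
// so that newly allocated FS file IDs can never collide with pre-existing block volumes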
auto img_it = proxy->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(proxy->default_pool_id+1, 0));
if (img_it != proxy->cli->st_cli.inode_config.begin())
{
img_it--;
if (INODE_POOL(img_it->first) == proxy->default_pool_id)
{
idgen[proxy->default_pool_id].min_id = INODE_NO_POOL(img_it->first) + 1;
}
}
readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
if (!readdir_getattr_parallel)
readdir_getattr_parallel = 8;
@ -230,7 +250,6 @@ void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
touch_interval = cfg["touch_interval"].uint64_value();
if (touch_interval < 100) // ms
touch_interval = 100;
auto & pool_cfg = proxy->cli->st_cli.pool_config.at(proxy->default_pool_id);
pool_block_size = pool_cfg.pg_stripe_size;
pool_alignment = pool_cfg.bitmap_granularity;
// Open DB and wait

View File

@ -44,6 +44,7 @@ struct kv_inode_extend_t
struct kv_idgen_t
{
uint64_t next_id = 1, allocated_id = 0;
uint64_t min_id = 1;
std::vector<uint64_t> unallocated_ids;
};

View File

@ -25,8 +25,7 @@ void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(i
cb(0, INODE_WITH_POOL(pool_id, idgen.next_id-1));
return;
}
// FIXME: Partial per-pool max ID limits
// FIXME: Fool protection from block volume and FS file ID overlap
// FIXME: Maybe allow FS and block volumes to cohabitate in the same pool, but with different ID ranges
else if (idgen.next_id >= ((uint64_t)1 << (64-POOL_ID_BITS)))
{
cb(-ENOSPC, 0);
@ -34,6 +33,7 @@ void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(i
}
self->parent->db->get((pool_id ? "id"+std::to_string(pool_id) : "id"), [=](int res, const std::string & prev_str)
{
auto & idgen = self->parent->kvfs->idgen[pool_id];
if (res < 0 && res != -ENOENT)
{
cb(res, 0);
@ -45,9 +45,9 @@ void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(i
cb(-ENOSPC, 0);
return;
}
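// IDs below min_id are reserved for pre-existing block images - never allocate them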
if (prev_val < 1)
if (prev_val < idgen.min_id)
{
prev_val = 1;
prev_val = idgen.min_id;
}
uint64_t new_val = prev_val + self->parent->kvfs->id_alloc_batch_size;
if (new_val >= self->parent->kvfs->fs_inode_count)

View File

@ -631,7 +631,7 @@ void osd_t::apply_no_inode_stats()
std::vector<uint64_t> no_inode_stats;
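// Collect IDs of FS pools - per-inode statistics are disabled for them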
for (auto & pool_item: st_cli.pool_config)
{
if (pool_item.second.no_inode_stats)
if (!pool_item.second.used_for_fs.empty())
{
no_inode_stats.push_back(pool_item.first);
}

View File

@ -4,6 +4,7 @@ PG_COUNT=16
. `dirname $0`/run_3osds.sh
build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta
build/src/vitastor-cli --etcd_address $ETCD_URL modify-pool --used-for-fs fsmeta testpool
build/src/vitastor-nfs start --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
NFS_PID=$!
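# A hypothetical extra check (an assumption, not part of the original test): after
# the pool is marked with --used-for-fs, plain image creation should fail without --force
if build/src/vitastor-cli --etcd_address $ETCD_URL create -s 1G blockimg 2>/dev/null; then
    echo "create without --force should have failed in an FS pool" >&2
    exit 1
fi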