2020-09-17 23:02:40 +03:00
|
|
|
// Copyright (c) Vitaliy Filippov, 2019+
|
2021-02-06 01:26:07 +03:00
|
|
|
// License: VNPL-1.1 (see README.md for details)
|
2020-09-17 23:02:40 +03:00
|
|
|
|
2020-05-03 11:04:20 +03:00
|
|
|
#include "osd_primary.h"
|
2021-01-12 01:02:56 +03:00
|
|
|
#include "allocator.h"
|
2020-02-24 02:40:48 +03:00
|
|
|
|
2020-02-03 12:35:02 +03:00
|
|
|
// read: read directly or read paired stripe(s), reconstruct, return
|
2020-04-05 15:50:42 +03:00
|
|
|
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
|
2020-02-09 18:57:45 +03:00
|
|
|
//
|
2020-02-03 12:35:02 +03:00
|
|
|
// nuance: take care to read the same version from paired stripes!
|
2020-02-09 18:57:45 +03:00
|
|
|
// to do so, we remember "last readable" version until a write request completes
|
|
|
|
// and we postpone other write requests to the same stripe until completion of previous ones
|
2020-02-03 12:35:02 +03:00
|
|
|
//
|
2020-04-05 15:50:42 +03:00
|
|
|
// sync: sync peers, get unstable versions, stabilize them
|
2020-02-03 12:35:02 +03:00
|
|
|
|
2020-02-23 02:11:43 +03:00
|
|
|
// Validate a primary read/write-like request (scrub is handled here too), find its PG
// and allocate the per-operation state.
//
// On success: cur_op->op_data and cur_op->bitmap_buf are allocated, the request is split
// into stripes, the PG's inflight counter is incremented and true is returned.
// On failure: the operation is finished with -EPIPE (pool config, PG or inode metadata is
// not usable right now) or -EINVAL (bad offset/length) and false is returned.
bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
    // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
    // Note: We read pool config here, so we must NOT change it when PGs are active
    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
    if (pool_cfg_it == st_cli.pool_config.end())
    {
        // Pool config is not loaded yet
        finish_op(cur_op, -EPIPE);
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
    // FIXME: op_data->pg_data_size can probably be removed (there's pg.pg_data_size)
    uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
    uint64_t pg_block_size = bs_block_size * pg_data_size;
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe
        .stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
    };
    pg_num_t pg_num = (oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
    auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
        // This OSD is not primary for this PG or the PG is inactive
        // FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
        finish_op(cur_op, -EPIPE);
        return false;
    }
    // The request must fit into a single parity stripe and be aligned to the bitmap granularity
    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
        (cur_op->req.rw.offset % bs_bitmap_granularity) != 0 ||
        (cur_op->req.rw.len % bs_bitmap_granularity) != 0)
    {
        finish_op(cur_op, -EINVAL);
        return false;
    }
    // Scrub is similar to r/w, so it's also handled here
    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
        && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
    int chain_size = 0;
    if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
    {
        // Chained read
        auto inode_it = st_cli.inode_config.find(cur_op->req.rw.inode);
        // The iterator must be checked before it is dereferenced: an inode which is unknown
        // to this OSD is also a case of "client and OSD see different metadata"
        if (inode_it == st_cli.inode_config.end() ||
            inode_it->second.mod_revision != cur_op->req.rw.meta_revision)
        {
            // Client view of the metadata differs from OSD's view
            // Operation can't be completed correctly, client should retry later
            finish_op(cur_op, -EPIPE);
            return false;
        }
        // Find parents from the same pool. Optimized reads only work within pools
        while (inode_it != st_cli.inode_config.end() && inode_it->second.parent_id &&
            INODE_POOL(inode_it->second.parent_id) == pg_it->second.pool_id &&
            // Check for loops
            inode_it->second.parent_id != cur_op->req.rw.inode)
        {
            chain_size++;
            inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
        }
        if (chain_size)
        {
            // Add the original inode
            chain_size++;
        }
    }
    // Everything below lives in one zero-filled allocation laid out as:
    // [op_data][stripes][read_chain][chain_states][snapshot_bitmaps][missing_flags]
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
        // Allocate:
        // - op_data
        1, sizeof(osd_primary_op_data_t) +
        // - stripes
        stripe_count * sizeof(osd_rmw_stripe_t) +
        chain_size * (
            // - copy of the chain
            sizeof(inode_t) +
            // - object states for every chain item
            sizeof(void*) +
            // - bitmap buffers for chained read
            stripe_count * clean_entry_bitmap_size +
            // - 'missing' flags for chained reads
            (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 0 : pg_it->second.pg_size)
        )
    );
    void *data_buf = (uint8_t*)op_data + sizeof(osd_primary_op_data_t);
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = (osd_rmw_stripe_t*)data_buf;
    data_buf = (uint8_t*)data_buf + sizeof(osd_rmw_stripe_t) * stripe_count;
    op_data->scheme = pool_cfg.scheme;
    op_data->pg_data_size = pg_data_size;
    op_data->pg_size = pg_it->second.pg_size;
    cur_op->op_data = op_data;
    split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    // Resulting bitmaps have to survive op_data and be freed with the op itself
    assert(!cur_op->bitmap_buf);
    cur_op->bitmap_buf = calloc_or_die(1, clean_entry_bitmap_size * stripe_count);
    for (int i = 0; i < stripe_count; i++)
    {
        op_data->stripes[i].bmp_buf = (uint8_t*)cur_op->bitmap_buf + clean_entry_bitmap_size * i;
    }
    op_data->chain_size = chain_size;
    if (chain_size > 0)
    {
        // Carve the chained-read arrays out of the tail of the allocation
        op_data->read_chain = (inode_t*)data_buf;
        data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
        op_data->chain_states = (pg_osd_set_state_t**)data_buf;
        data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
        op_data->snapshot_bitmaps = data_buf;
        data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
        op_data->missing_flags = (uint8_t*)data_buf;
        data_buf = (uint8_t*)data_buf + chain_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 0 : pg_it->second.pg_size);
        // Copy chain
        // The walk below must use exactly the same condition as the counting walk above,
        // because the arrays were sized from that count
        int chain_num = 0;
        op_data->read_chain[chain_num] = cur_op->req.rw.inode;
        op_data->chain_states[chain_num] = NULL;
        chain_num++;
        auto inode_it = st_cli.inode_config.find(cur_op->req.rw.inode);
        while (inode_it != st_cli.inode_config.end() && inode_it->second.parent_id &&
            INODE_POOL(inode_it->second.parent_id) == pg_it->second.pool_id &&
            // Check for loops
            inode_it->second.parent_id != cur_op->req.rw.inode)
        {
            op_data->read_chain[chain_num] = inode_it->second.parent_id;
            op_data->chain_states[chain_num] = NULL;
            inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
            chain_num++;
        }
    }
    pg_it->second.inflight++;
    return true;
}
|
|
|
|
|
2023-02-27 02:12:55 +03:00
|
|
|
// Find the OSD set an object should be read from.
//
// If the PG has incomplete, degraded or misplaced objects, <oid> is looked up in those
// three maps (in that order). On a hit *object_state is set to the found state and its
// read_target is returned. Otherwise *object_state is set to NULL and the PG's current
// OSD set is returned.
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
{
    *object_state = NULL;
    if (pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED))
    {
        // Lookup order is significant: incomplete, then degraded, then misplaced
        decltype(pg.incomplete_objects) *state_maps[] = {
            &pg.incomplete_objects,
            &pg.degraded_objects,
            &pg.misplaced_objects,
        };
        for (auto *objects: state_maps)
        {
            auto found = objects->find(oid);
            if (found != objects->end())
            {
                *object_state = found->second;
                return found->second->read_target.data();
            }
        }
    }
    // Object is not listed in any of the maps - use the PG's current set
    return pg.cur_set.data();
}
|
|
|
|
|
2020-02-24 02:40:48 +03:00
|
|
|
// Primary read handler, written as a resumable state machine keyed by op_data->st.
//
// st == 0: prepare the op, choose the read ranges / version / OSD set and submit reads.
// st == 1: reads are in flight, nothing to do here.
// st == 2: handle the read results (this function never sets st to 2 itself - presumably
//          the subop completion path does; verify against the caller).
// Chained reads (op_data->chain_size != 0) are delegated to continue_chained_read().
void osd_t::continue_primary_read(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->chain_size)
    {
        continue_chained_read(cur_op);
        return;
    }
    switch (op_data->st)
    {
        case 1:
            goto resume_1;
        case 2:
            goto resume_2;
        default:
            break;
    }
resume_0:
    cur_op->reply.rw.bitmap_len = 0;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
        // len=0 => bitmap read: every data stripe gets the [0, UINT32_MAX) range,
        // otherwise the read range equals the requested range
        const bool bitmap_read = (cur_op->req.rw.len == 0);
        for (int role = 0; role < op_data->pg_data_size; role++)
        {
            auto & stripe = op_data->stripes[role];
            if (bitmap_read)
            {
                stripe.read_start = 0;
                stripe.read_end = UINT32_MAX;
            }
            else
            {
                stripe.read_start = stripe.req_start;
                stripe.read_end = stripe.req_end;
            }
        }
        // Determine version
        auto override_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = override_it != pg.ver_override.end() ? override_it->second : UINT64_MAX;
        // PG may have degraded or misplaced objects
        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
            const bool incomplete_replica = op_data->scheme == POOL_SCHEME_REPLICATED &&
                op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE);
            if (incomplete_replica)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
            op_data->st = 1;
        }
        else
        {
            // Degraded path: extend the read to the stripes needed for reconstruction
            if (extend_missing_stripes(op_data->stripes, op_data->prev_set, op_data->pg_data_size, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
            op_data->pg_size = pg.pg_size;
            op_data->scheme = pg.scheme;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
            op_data->st = 1;
        }
    }
resume_1:
    return;
resume_2:
    if (op_data->errors > 0)
    {
        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
        {
            // I/O or checksum error
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
            // Retry the read using the updated object state
            goto resume_0;
        }
        finish_op(cur_op, op_data->errcode);
        return;
    }
    cur_op->reply.rw.version = op_data->fact_ver;
    cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
    if (op_data->degraded)
    {
        // Reconstruct missing stripes
        if (op_data->scheme == POOL_SCHEME_XOR)
        {
            reconstruct_stripes_xor(op_data->stripes, op_data->pg_size, clean_entry_bitmap_size);
        }
        else if (op_data->scheme == POOL_SCHEME_EC)
        {
            reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
        }
    }
    // The reply always starts with the bitmap, the data follows
    cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
    if (op_data->degraded)
    {
        for (int role = 0; role < op_data->pg_size; role++)
        {
            auto & stripe = op_data->stripes[role];
            if (stripe.req_end == 0)
            {
                continue;
            }
            // Send buffer in parts to avoid copying
            cur_op->iov.push_back(
                (uint8_t*)stripe.read_buf + (stripe.req_start - stripe.read_start),
                stripe.req_end - stripe.req_start
            );
        }
    }
    else
    {
        cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
    }
    finish_op(cur_op, cur_op->req.rw.len);
}
|
|
|
|
|
2024-04-07 18:02:05 +03:00
|
|
|
// Recalculate the location set of an object and move the object into the matching state.
//
// pg                - PG which owns the object
// oid               - object to re-mark
// prev_object_state - state the caller last saw for the object (may be NULL)
// ref               - true if the caller holds a reference (ref_count) on prev_object_state;
//                     in that case the reference is moved to the returned state
// calc_set          - callback which modifies the object's location set in place and
//                     returns the number of changes it made
//
// Returns the current object state. It may be NULL: add_object_to_set() returns NULL
// when the object is clean.
pg_osd_set_state_t *osd_t::mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
    std::function<int(pg_osd_set_t & new_set)> calc_set)
{
    pg_osd_set_state_t *object_state = NULL;
    get_object_osd_set(pg, oid, &object_state);
    if (prev_object_state != object_state)
    {
        // Object state changed in between by a parallel I/O operation, skip marking as failed
        if (ref)
        {
            deref_object_state(pg, &prev_object_state, ref);
            if (object_state)
                object_state->ref_count++;
        }
        return object_state;
    }
    // Start from the object's own set, or from the PG's current set if the object has no state
    pg_osd_set_t new_set;
    if (object_state)
    {
        new_set = object_state->osd_set;
    }
    else
    {
        for (size_t i = 0; i < pg.cur_set.size(); i++)
        {
            new_set.push_back((pg_obj_loc_t){
                .role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
                .osd_num = pg.cur_set[i],
            });
        }
    }
    int changes = calc_set(new_set);
    if (!changes)
    {
        // No chunks newly marked as corrupted - object is already marked or moved
        return object_state;
    }
    int old_pg_state = pg.state;
    if (object_state)
    {
        remove_object_from_state(oid, &object_state, pg, false);
        deref_object_state(pg, &object_state, ref);
    }
    // Insert object into the new state and retry
    object_state = add_object_to_set(pg, oid, new_set, old_pg_state, 2);
    // add_object_to_set() returns NULL when the object is clean, so only take
    // a reference when there is a state object to reference
    if (ref && object_state)
    {
        object_state->ref_count++;
    }
    return object_state;
}
|
|
|
|
|
2024-04-07 18:02:05 +03:00
|
|
|
// Update the object's location set from the results of a read and re-mark the object.
//
// For every chunk whose OSD matches the stripe of the same role:
//  - the chunk is removed from the set when the stripe is flagged not_exists;
//  - LOC_CORRUPTED is set when the stripe is flagged read_error;
//  - LOC_CORRUPTED is cleared when the stripe was read (read_end > 0, not missing)
//    without a read error.
// Additionally, LOC_INCONSISTENT is set on chunks with no other flags when <inconsistent>
// is true, and cleared from all chunks when it is false.
//
// Parameters other than <stripes> and <inconsistent> are passed through to mark_object(),
// and its result is returned.
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
    osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
{
    return mark_object(pg, oid, prev_object_state, ref, [stripes, inconsistent](pg_osd_set_t & new_set)
    {
        // Mark object chunk(s) as corrupted
        int changes = 0;
        for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
        {
            auto & chunk = *chunk_it;
            if (stripes[chunk.role].osd_num == chunk.osd_num)
            {
                if (stripes[chunk.role].not_exists)
                {
                    changes++;
                    // erase() invalidates chunk_it (and <chunk>), so continue
                    // from the iterator it returns instead of reusing the old one
                    chunk_it = new_set.erase(chunk_it);
                    continue;
                }
                if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
                {
                    changes++;
                    chunk.loc_bad = LOC_CORRUPTED;
                }
                else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
                    (chunk.loc_bad & LOC_CORRUPTED))
                {
                    // Chunk was read successfully this time - it's not corrupted anymore
                    changes++;
                    chunk.loc_bad &= ~LOC_CORRUPTED;
                }
            }
            if (inconsistent && !chunk.loc_bad)
            {
                changes++;
                chunk.loc_bad |= LOC_INCONSISTENT;
            }
            else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
            {
                changes++;
                chunk.loc_bad &= ~LOC_INCONSISTENT;
            }
            chunk_it++;
        }
        return changes;
    });
}
|
|
|
|
|
|
|
|
// Mark the object as partially updated (probably due to a ENOSPC)
|
|
|
|
pg_osd_set_state_t *osd_t::mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
|
|
|
osd_rmw_stripe_t *stripes, bool ref)
|
|
|
|
{
|
|
|
|
return mark_object(pg, oid, prev_object_state, ref, [stripes](pg_osd_set_t & new_set)
|
|
|
|
{
|
|
|
|
// Mark object chunk(s) as outdated
|
|
|
|
int changes = 0;
|
|
|
|
for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
|
|
|
|
{
|
|
|
|
auto & chunk = *chunk_it;
|
|
|
|
if (stripes[chunk.role].osd_num == chunk.osd_num &&
|
|
|
|
stripes[chunk.role].read_error &&
|
|
|
|
chunk.loc_bad != LOC_OUTDATED)
|
|
|
|
{
|
|
|
|
changes++;
|
|
|
|
chunk.loc_bad = LOC_OUTDATED;
|
|
|
|
}
|
|
|
|
chunk_it++;
|
|
|
|
}
|
|
|
|
return changes;
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-04-10 01:05:41 +03:00
|
|
|
// Calculate the state of an object from its location set, account for it in the OSD and PG
// statistics / state flags and register it in the PG.
//
// pg            - PG which owns the object
// oid           - object identifier
// osd_set       - locations (role, OSD number, loc_bad flags) of the object's chunks
// old_pg_state  - PG state to compare against; the PG state is reported and recovery is
//                 woken up only when the resulting state differs from it
// log_at_level  - the summary line is printed when this->log_level >= log_at_level
//
// Returns NULL if the object turns out to be clean, otherwise the result of
// pg.add_object_to_state().
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
    uint64_t old_pg_state, int log_at_level)
{
    // Object state will be calculated from <osd_set>
    uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
        n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
    for (auto & chunk: osd_set)
    {
        if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
        {
            n_invalid++;
        }
        else if (chunk.loc_bad & LOC_OUTDATED)
        {
            n_outdated++;
        }
        else
        {
            if (chunk.loc_bad & LOC_INCONSISTENT)
            {
                n_inconsistent++;
            }
            if (chunk.loc_bad & LOC_CORRUPTED)
            {
                // Corrupted chunks don't count as available roles
                n_corrupted++;
            }
            else if (pg.scheme == POOL_SCHEME_REPLICATED)
            {
                n_roles = 1;
                // A replica is misplaced when its OSD is not in the PG's current set
                size_t i;
                for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
                if (i == pg.cur_set.size())
                {
                    n_misplaced++;
                }
            }
            else
            {
                // The mask is 64-bit, so the shifted one must be 64-bit too
                // (a plain int 1 overflows for role >= 31)
                const uint64_t role_bit = (uint64_t)1 << chunk.role;
                if (!(has_roles & role_bit))
                {
                    n_roles++;
                    has_roles |= role_bit;
                }
                if (pg.cur_set[chunk.role] != chunk.osd_num)
                {
                    n_misplaced++;
                }
            }
            n_copies++;
        }
    }
    uint64_t obj_state = 0;
    int pg_state_bits = 0;
    if (n_corrupted > 0)
    {
        this->corrupted_objects++;
        pg.corrupted_count++;
        obj_state |= OBJ_CORRUPTED;
        pg_state_bits |= PG_HAS_CORRUPTED;
    }
    // Note: the branches below must OR their bit into pg_state_bits, because
    // PG_HAS_CORRUPTED may already be set above and must not be lost
    if (n_invalid > 0 || n_inconsistent > 0)
    {
        this->inconsistent_objects++;
        obj_state |= OBJ_INCONSISTENT;
        pg_state_bits |= PG_HAS_INCONSISTENT;
    }
    else if (n_roles < pg.pg_data_size)
    {
        this->incomplete_objects++;
        obj_state |= OBJ_INCOMPLETE;
        pg_state_bits |= PG_HAS_INCOMPLETE;
    }
    else if (n_roles < pg.pg_cursize)
    {
        this->degraded_objects++;
        obj_state |= OBJ_DEGRADED;
        pg_state_bits |= PG_HAS_DEGRADED;
    }
    else if (n_misplaced > 0 || n_outdated > 0)
    {
        this->misplaced_objects++;
        obj_state |= OBJ_MISPLACED;
        pg_state_bits |= PG_HAS_MISPLACED;
    }
    if (this->log_level >= log_at_level)
    {
        printf("Marking object %jx:%jx ", oid.inode, oid.stripe);
        for (int i = 0, j = 0; i < object_state_bit_count; i++)
        {
            // Print the names of all set state bits, or the name of the zero state if none are set
            if ((obj_state & object_state_bits[i]) || (object_state_bits[i] == 0 && obj_state == 0))
            {
                printf((j++) ? "+%s" : "%s", object_state_names[i]);
            }
        }
        if (pg.scheme == POOL_SCHEME_REPLICATED)
        {
            printf(": %ju copies available", n_copies);
        }
        else
        {
            printf(": %ju parts / %ju copies available", n_roles, n_copies);
        }
        if (n_invalid > 0)
        {
            printf(", %ju invalid", n_invalid);
        }
        if (n_outdated > 0)
        {
            printf(", %ju outdated", n_outdated);
        }
        if (n_misplaced > 0)
        {
            printf(", %ju misplaced", n_misplaced);
        }
        if (n_corrupted > 0)
        {
            printf(", %ju corrupted", n_corrupted);
        }
        if (n_inconsistent > 0)
        {
            printf(", %ju inconsistent", n_inconsistent);
        }
        printf("\n");
    }
    pg.state |= pg_state_bits;
    if (pg.state != old_pg_state)
    {
        report_pg_state(pg);
        if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
            (old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
        {
            peering_state = peering_state | OSD_RECOVERING;
            if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
            {
                // Restart recovery from degraded objects
                recovery_last_degraded = true;
                recovery_last_pg = {};
                recovery_last_oid = {};
            }
            ringloop->wakeup();
        }
    }
    if (!obj_state)
    {
        // Object is clean
        return NULL;
    }
    // Insert object into the new state and retry
    return pg.add_object_to_state(oid, obj_state, osd_set);
}
|
|
|
|
|
2020-05-05 00:16:01 +03:00
|
|
|
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
2023-02-27 02:12:55 +03:00
|
|
|
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
|
2020-05-05 00:16:01 +03:00
|
|
|
{
|
2023-01-24 02:26:52 +03:00
|
|
|
if (!*object_state)
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pg_osd_set_state_t *recheck_state = NULL;
|
2023-02-27 02:12:55 +03:00
|
|
|
get_object_osd_set(pg, oid, &recheck_state);
|
2023-01-24 02:26:52 +03:00
|
|
|
if (recheck_state != *object_state)
|
|
|
|
{
|
|
|
|
recheck_state->ref_count++;
|
|
|
|
(*object_state)->ref_count--;
|
|
|
|
*object_state = recheck_state;
|
|
|
|
return;
|
|
|
|
}
|
2023-04-10 01:05:41 +03:00
|
|
|
bool changed = false;
|
2023-01-24 02:26:52 +03:00
|
|
|
(*object_state)->object_count--;
|
2023-02-27 02:12:55 +03:00
|
|
|
if ((*object_state)->state & OBJ_CORRUPTED)
|
|
|
|
{
|
|
|
|
this->corrupted_objects--;
|
|
|
|
pg.corrupted_count--;
|
2023-04-10 01:05:41 +03:00
|
|
|
if (!pg.corrupted_count)
|
|
|
|
{
|
|
|
|
pg.state = pg.state & ~PG_HAS_CORRUPTED;
|
|
|
|
changed = true;
|
|
|
|
}
|
2023-02-27 02:12:55 +03:00
|
|
|
}
|
2023-04-10 01:05:41 +03:00
|
|
|
if ((*object_state)->state & OBJ_INCONSISTENT)
|
|
|
|
{
|
|
|
|
this->inconsistent_objects--;
|
|
|
|
pg.inconsistent_objects.erase(oid);
|
|
|
|
if (!pg.inconsistent_objects.size())
|
|
|
|
{
|
|
|
|
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
|
|
|
|
changed = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ((*object_state)->state & OBJ_INCOMPLETE)
|
2020-05-05 00:16:01 +03:00
|
|
|
{
|
|
|
|
// Successful write means that object is not incomplete anymore
|
|
|
|
this->incomplete_objects--;
|
|
|
|
pg.incomplete_objects.erase(oid);
|
|
|
|
if (!pg.incomplete_objects.size())
|
|
|
|
{
|
|
|
|
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
2023-02-27 02:12:55 +03:00
|
|
|
changed = true;
|
2020-05-05 00:16:01 +03:00
|
|
|
}
|
|
|
|
}
|
2023-01-24 02:26:52 +03:00
|
|
|
else if ((*object_state)->state & OBJ_DEGRADED)
|
2020-05-05 00:16:01 +03:00
|
|
|
{
|
|
|
|
this->degraded_objects--;
|
|
|
|
pg.degraded_objects.erase(oid);
|
|
|
|
if (!pg.degraded_objects.size())
|
|
|
|
{
|
|
|
|
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
2023-02-27 02:12:55 +03:00
|
|
|
changed = true;
|
2020-05-05 00:16:01 +03:00
|
|
|
}
|
|
|
|
}
|
2023-01-24 02:26:52 +03:00
|
|
|
else if ((*object_state)->state & OBJ_MISPLACED)
|
2020-05-05 00:16:01 +03:00
|
|
|
{
|
|
|
|
this->misplaced_objects--;
|
|
|
|
pg.misplaced_objects.erase(oid);
|
|
|
|
if (!pg.misplaced_objects.size())
|
|
|
|
{
|
|
|
|
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
2023-02-27 02:12:55 +03:00
|
|
|
changed = true;
|
2020-05-05 00:16:01 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2023-01-24 02:26:52 +03:00
|
|
|
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
|
2020-05-05 00:16:01 +03:00
|
|
|
}
|
2023-02-27 02:12:55 +03:00
|
|
|
if (changed && report)
|
|
|
|
{
|
|
|
|
report_pg_state(pg);
|
|
|
|
}
|
2021-03-21 15:02:24 +03:00
|
|
|
}
|
|
|
|
|
2023-01-24 02:26:52 +03:00
|
|
|
// Optionally drop one reference from an object state and free the state when it's unused.
//
// object_state - in/out: state to release; nothing happens if *object_state is NULL.
//                It is reset to NULL when the state gets erased from pg.state_dict.
// deref        - decrement ref_count first if true
//
// The state is erased only when both its object_count and ref_count are zero.
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
{
    pg_osd_set_state_t *state = *object_state;
    if (!state)
    {
        return;
    }
    if (deref)
    {
        state->ref_count--;
    }
    if (state->object_count || state->ref_count)
    {
        // Still used by some objects or operations
        return;
    }
    pg.state_dict.erase(state->osd_set);
    *object_state = NULL;
}
|
|
|
|
|
|
|
|
// Primary delete handler, written as a resumable state machine keyed by op_data->st.
//
// st == 0: prepare the op and pass it through check_write_queue(); stop if it returns false.
// st == 1: find the object's OSD set and submit a read to learn the actual version.
// st == 2: the read is in flight.
// st == 3: handle the read result, check the CAS version and submit deletes.
// st == 4: deletes are in flight.
// st == 5: handle the delete result, update statistics and dirty sets, finish the op
//          and continue the next queued write to the same object.
// States 1, 3 and 5 are never assigned in this function - presumably the write queue and
// the subop completion path set them; verify against those callers.
void osd_t::continue_primary_del(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    assert(op_data->st == 0);
    if (!check_write_queue(cur_op, pg))
    {
        return;
    }
resume_1:
    // Determine which OSDs contain this object and delete it
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
    if (op_data->object_state)
    {
        // Hold the state until the operation completes or fails
        op_data->object_state->ref_count++;
    }
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
    op_data->prev_set = NULL;
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
    // Submit deletes
    op_data->fact_ver++;
    submit_primary_del_subops(cur_op, NULL, 0, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->errors > 0)
    {
        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Remove version override
    pg.ver_override.erase(op_data->oid);
    // Mark OSDs as dirty. This has to be done BEFORE the object state is released below:
    // releasing may reset or replace op_data->object_state, and then the loop would walk
    // a different set than the one the deletes were actually submitted to
    for (auto & chunk: (op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set))
    {
        this->dirty_osds.insert(chunk.osd_num);
    }
    // Adjust PG stats after "instant stabilize", because we need object_state above
    if (!op_data->object_state)
    {
        pg.clean_count--;
    }
    else
    {
        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
        deref_object_state(pg, &op_data->object_state, true);
    }
    // Mark PG as dirty, both for the client which sent the operation (if it's still
    // connected) and for this OSD
    {
        auto cl_it = msgr.clients.find(cur_op->peer_fd);
        if (cl_it != msgr.clients.end())
        {
            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        }
    }
    dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    if (immediate_commit == IMMEDIATE_NONE)
    {
        unstable_write_count++;
        if (unstable_write_count >= autosync_writes)
        {
            unstable_write_count = 0;
            autosync();
        }
    }
    pg.total_count--;
    cur_op->reply.hdr.retval = 0;
continue_others:
    osd_op_t *next_op = NULL;
    auto next_it = pg.write_queue.find(op_data->oid);
    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
    {
        // Remove this operation from the queue and pick the next one for the same object
        pg.write_queue.erase(next_it++);
        if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
            next_op = next_it->second;
    }
    finish_op(cur_op, cur_op->reply.hdr.retval);
    if (next_op)
    {
        // Continue next write to the same object
        continue_primary_write(next_op);
    }
}
|