// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include "osd_primary.h"

// peer_fd value marking operations generated by this OSD itself
// (as opposed to operations received from a client connection)
#define SELF_FD -1

void osd_t : : autosync ( )
{
if ( immediate_commit ! = IMMEDIATE_ALL & & ! autosync_op )
{
autosync_op = new osd_op_t ( ) ;
autosync_op - > op_type = OSD_OP_IN ;
2023-12-09 15:36:00 +03:00
autosync_op - > peer_fd = SELF_FD ;
2020-11-09 00:07:07 +03:00
autosync_op - > req = ( osd_any_op_t ) {
2020-05-03 11:04:20 +03:00
. sync = {
. header = {
. magic = SECONDARY_OSD_OP_MAGIC ,
. id = 1 ,
. opcode = OSD_OP_SYNC ,
} ,
} ,
} ;
autosync_op - > callback = [ this ] ( osd_op_t * op )
{
if ( op - > reply . hdr . retval < 0 )
{
printf ( " Warning: automatic sync resulted in an error: %ld (%s) \n " , - op - > reply . hdr . retval , strerror ( - op - > reply . hdr . retval ) ) ;
}
delete autosync_op ;
autosync_op = NULL ;
} ;
exec_op ( autosync_op ) ;
}
}
// Complete a primary operation: account statistics, release the op's
// PG slot and op_data, fill the reply header with <retval>, then either
// invoke the callback (self-generated ops) or queue the reply to the client.
void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
    inflight_ops--;
    if (cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_DELETE)
    {
        // Track inode statistics
        if (!cur_op->tv_end.tv_sec)
        {
            clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
        }
        // Operation latency in microseconds
        uint64_t usec = (
            (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
            (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
        );
        int inode_st_op = cur_op->req.hdr.opcode == OSD_OP_DELETE
            ? INODE_STATS_DELETE
            : (cur_op->req.hdr.opcode == OSD_OP_READ ? INODE_STATS_READ : INODE_STATS_WRITE);
        inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
        inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
        if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
        {
            // Deletes carry no payload length - account PG data size instead
            if (cur_op->op_data)
                inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
        }
        else
            inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
    }
    if (cur_op->op_data)
    {
        if (cur_op->op_data->pg_num > 0)
        {
            // The op was counted against a PG - drop its inflight counter
            auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
            pg.inflight--;
            assert(pg.inflight >= 0);
            // PG stop/repeer is deferred until all inflight ops drain
            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
            {
                finish_stop_pg(pg);
            }
            else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
            {
                start_pg_peering(pg);
            }
        }
        // All subops must have been freed by this point
        assert(!cur_op->op_data->subops);
        free(cur_op->op_data);
        cur_op->op_data = NULL;
    }
    cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    cur_op->reply.hdr.id = cur_op->req.hdr.id;
    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
    cur_op->reply.hdr.retval = retval;
    if (cur_op->peer_fd == SELF_FD)
    {
        // Do not include internal primary writes (recovery/rebalance) into client op statistics
        if (cur_op->req.hdr.opcode != OSD_OP_WRITE)
        {
            msgr.measure_exec(cur_op);
        }
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
    }
    else
    {
        // FIXME add separate magic number for primary ops
        auto cl_it = msgr.clients.find(cur_op->peer_fd);
        if (cl_it != msgr.clients.end())
        {
            msgr.outbox_push(cur_op);
        }
        else
        {
            // Client disconnected - nobody to reply to, drop the op
            delete cur_op;
        }
    }
}
2021-03-23 00:26:22 +03:00
// Count and allocate the subops required for a primary read/write over
// <osd_set>, then delegate actual submission to submit_primary_subop_batch().
// submit_type: SUBMIT_WRITE / SUBMIT_RMW_READ / SUBMIT_READ (see callers).
void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const uint64_t* osd_set, osd_op_t *cur_op)
{
    bool wr = submit_type == SUBMIT_WRITE;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    osd_rmw_stripe_t *stripes = op_data->stripes;
    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
    // Allocate subops
    int n_subops = 0, zero_read = -1;
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // NOTE: relies on && binding tighter than || - prefer the local OSD,
        // otherwise remember the first live OSD as the zero-length read target
        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
            zero_read = role;
        // Writes go to every live replica; reads only where read_end != 0 (EC/XOR)
        if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
            n_subops++;
    }
    // RMW reads / replicated ops still need one (possibly zero-length) subop
    // to fetch the object version even when no data has to be transferred
    if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
        n_subops = 1;
    else
        zero_read = -1;
    osd_op_t *subops = new osd_op_t[n_subops];
    op_data->fact_ver = 0;
    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_subops;
    op_data->subops = subops;
    int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
    assert(sent == n_subops);
}
// Submit one batch of read/write subops starting at index <subop_idx> in
// op_data->subops. Local stripes go to the blockstore, remote stripes are
// sent as secondary OSD ops. <zero_read> is the role that receives a
// zero-length read when nothing else has to be read (version fetch only).
// Returns the number of subops actually submitted.
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
    osd_rmw_stripe_t *stripes, const uint64_t* osd_set, osd_op_t *cur_op, int subop_idx, int zero_read)
{
    bool wr = submit_type == SUBMIT_WRITE;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
    int i = subop_idx;
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // We always submit zero-length writes to all replicas, even if the stripe is not modified
        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
        {
            continue;
        }
        osd_num_t role_osd_num = osd_set[role];
        // Replicated pools store the whole object as stripe 0 on every replica
        int stripe_num = rep ? 0 : role;
        // Scrub reads every copy separately, so it indexes stripes by role
        osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
        if (role_osd_num != 0)
        {
            osd_op_t *subop = op_data->subops + i;
            uint32_t subop_len = wr
                ? si->write_end - si->write_start
                : si->read_end - si->read_start;
            // read_end == UINT32_MAX means "fetch version only" -> zero-length read
            if (!wr && si->read_end == UINT32_MAX)
            {
                subop_len = 0;
            }
            si->osd_num = role_osd_num;
            si->read_error = false;
            subop->bitmap = si->bmp_buf;
            subop->bitmap_len = clean_entry_bitmap_size;
            // Using rmw_buf to pass pointer to stripes. Dirty but should work
            subop->rmw_buf = si;
            if (role_osd_num == this->osd_num)
            {
                // Local copy: submit directly to the blockstore.
                // op_type doubles as a back-pointer to the parent op here.
                clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
                subop->op_type = (uint64_t)cur_op;
                subop->bs_op = new blockstore_op_t((blockstore_op_t){
                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
                    .callback = [subop, this](blockstore_op_t *bs_subop)
                    {
                        handle_primary_bs_subop(subop);
                    },
                    { {
                        .oid = (object_id){
                            .inode = inode,
                            .stripe = op_data->oid.stripe | stripe_num,
                        },
                        .version = op_version,
                        .offset = wr ? si->write_start : si->read_start,
                        .len = subop_len,
                    } },
                    .buf = wr ? si->write_buf : si->read_buf,
                    .bitmap = si->bmp_buf,
                });
#ifdef OSD_DEBUG
                printf(
                    "Submit %s to local: %lx:%lx v%lu %u-%u\n", wr ? "write" : "read",
                    inode, op_data->oid.stripe | stripe_num, op_version,
                    subop->bs_op->offset, subop->bs_op->len
                );
#endif
                bs->enqueue_op(subop->bs_op);
            }
            else
            {
                // Remote copy: build a secondary OSD request
                subop->op_type = OSD_OP_OUT;
                subop->req.sec_rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
                        .id = msgr.next_subop_id++,
                        .opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
                    },
                    .oid = {
                        .inode = inode,
                        .stripe = op_data->oid.stripe | stripe_num,
                    },
                    .version = op_version,
                    .offset = wr ? si->write_start : si->read_start,
                    .len = subop_len,
                    // Writes also carry the "clean entry" bitmap as an attribute
                    .attr_len = wr ? clean_entry_bitmap_size : 0,
                };
#ifdef OSD_DEBUG
                printf(
                    "Submit %s to osd %lu: %lx:%lx v%lu %u-%u\n", wr ? "write" : "read", role_osd_num,
                    inode, op_data->oid.stripe | stripe_num, op_version,
                    subop->req.sec_rw.offset, subop->req.sec_rw.len
                );
#endif
                if (wr)
                {
                    if (si->write_end > si->write_start)
                    {
                        subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
                    }
                }
                else
                {
                    if (subop_len > 0)
                    {
                        subop->iov.push_back(si->read_buf, subop_len);
                    }
                }
                subop->callback = [cur_op, this](osd_op_t *subop)
                {
                    handle_primary_subop(subop, cur_op);
                };
                auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
                if (peer_fd_it != msgr.osd_peer_fds.end())
                {
                    subop->peer_fd = peer_fd_it->second;
                    msgr.outbox_push(subop);
                }
                else
                {
                    // Fail it immediately
                    // (deferred to the event loop so the callback never runs re-entrantly)
                    subop->peer_fd = -1;
                    subop->reply.hdr.retval = -EPIPE;
                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
                }
            }
            i++;
        }
        else
        {
            // Role has no live OSD - mark the stripe as unsubmitted
            si->osd_num = 0;
        }
    }
    return i - subop_idx;
}
2020-05-11 02:58:13 +03:00
// Maps blockstore opcodes (BS_OP_*) to the equivalent secondary OSD
// opcodes (OSD_OP_SEC_*), so that completed local blockstore subops can
// be fed into the same completion path as remote subops
// (see handle_primary_bs_subop). Index 0 is unused.
static uint64_t bs_op_to_osd_op[] = {
    0,
    OSD_OP_SEC_READ,            // BS_OP_READ = 1
    OSD_OP_SEC_WRITE,           // BS_OP_WRITE = 2
    OSD_OP_SEC_WRITE_STABLE,    // BS_OP_WRITE_STABLE = 3
    OSD_OP_SEC_SYNC,            // BS_OP_SYNC = 4
    OSD_OP_SEC_STABILIZE,       // BS_OP_STABLE = 5
    OSD_OP_SEC_DELETE,          // BS_OP_DELETE = 6
    OSD_OP_SEC_LIST,            // BS_OP_LIST = 7
    OSD_OP_SEC_ROLLBACK,        // BS_OP_ROLLBACK = 8
    OSD_OP_TEST_SYNC_STAB_ALL,  // BS_OP_SYNC_STAB_ALL = 9
};
// Completion handler for a local blockstore subop: validate the result,
// record statistics, translate the blockstore op into the equivalent
// secondary-OSD-op form and pass it to the common handle_primary_subop().
void osd_t::handle_primary_bs_subop(osd_op_t *subop)
{
    // op_type was repurposed to hold the parent op pointer for local subops
    osd_op_t *cur_op = (osd_op_t*)subop->op_type;
    blockstore_op_t *bs_op = subop->bs_op;
    // Reads/writes are expected to return their full length, other ops 0
    int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
        || bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ &&
        (bs_op->opcode != BS_OP_WRITE && bs_op->opcode != BS_OP_WRITE_STABLE ||
        bs_op->retval != -ENOSPC))
    {
        // die on any error except ENOSPC
        throw std::runtime_error(
            "local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
            " retval = "+std::to_string(bs_op->retval)+")"
        );
    }
    add_bs_subop_stats(subop);
    // Re-describe the blockstore result as a secondary OSD op/reply pair
    subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
    subop->reply.hdr.retval = bs_op->retval;
    if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
    {
        subop->req.sec_rw.oid = bs_op->oid;
        subop->req.sec_rw.version = bs_op->version;
        subop->req.sec_rw.len = bs_op->len;
        subop->reply.sec_rw.version = bs_op->version;
    }
    delete bs_op;
    subop->bs_op = NULL;
    // Negative peer_fd marks the subop as local in handle_primary_subop()
    subop->peer_fd = -1;
    handle_primary_subop(subop, cur_op);
}
// Record execution statistics for a completed local blockstore subop,
// attributing it to the corresponding secondary OSD opcode.
void osd_t::add_bs_subop_stats(osd_op_t *subop)
{
    // Include local blockstore ops in statistics
    uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
    timespec tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_end);
    msgr.stats.op_stat_count[opcode]++;
    // Counter wrapped around to zero - restart all stats for this opcode
    // so averages stay consistent (count=1, sums reset)
    if (!msgr.stats.op_stat_count[opcode])
    {
        msgr.stats.op_stat_count[opcode] = 1;
        msgr.stats.op_stat_sum[opcode] = 0;
        msgr.stats.op_stat_bytes[opcode] = 0;
    }
    // Accumulate execution time in microseconds
    msgr.stats.op_stat_sum[opcode] += (
        (tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
        (tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
    );
    if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
    {
        msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len;
    }
}
2020-06-02 18:44:23 +03:00
// Common completion handler for one subop (local or remote) of a primary
// operation <cur_op>. Classifies the result, accumulates done/errors and,
// once all subops have completed, resumes the parent operation's state
// machine (continue_primary_*).
void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
{
    uint64_t opcode = subop->req.hdr.opcode;
    int retval = subop->reply.hdr.retval;
    // Expected retval: full data length for rw ops, computed reply size for
    // bitmap reads, 0 for everything else
    int expected;
    if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        expected = subop->req.sec_rw.len;
    else if (opcode == OSD_OP_SEC_READ_BMP)
        expected = subop->req.sec_read_bmp.len / sizeof(obj_ver_id) * (8 + clean_entry_bitmap_size);
    else
        expected = 0;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
    {
        // ENOENT is not an error for almost all reads, except scrub
        // (rmw_buf points at the stripe - see submit_primary_subop_batch)
        retval = expected;
        memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
        ((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
    }
    if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
    {
        uint64_t version = subop->reply.sec_rw.version;
#ifdef OSD_DEBUG
        uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
            ? msgr.clients[subop->peer_fd]->osd_num : osd_num;
        printf("subop %s %lx:%lx from osd %lu: version = %lu\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
#endif
        // fact_ver == UINT64_MAX means version tracking is disabled for this op
        if (op_data->fact_ver != UINT64_MAX)
        {
            // All copies must report the same object version
            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
            {
                fprintf(
                    stderr, "different fact_versions returned from %s subops: %lu vs %lu\n",
                    osd_op_names[opcode], version, op_data->fact_ver
                );
                retval = -ERANGE;
            }
            else
                op_data->fact_ver = version;
        }
    }
    if (retval != expected)
    {
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            // POSIX positional (%n$) conversions - both formats consume the
            // same argument list, only the wording differs
            printf(
                subop->peer_fd >= 0
                    ? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
                    : "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
                retval, expected, subop->peer_fd
            );
        }
        else
        {
            printf(
                "%s subop failed on peer %d: retval = %d (expected %d)\n",
                osd_op_names[opcode], subop->peer_fd, retval, expected
            );
        }
        if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
        {
            // We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
            ((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
        }
        subop->rmw_buf = NULL;
        // Error priority: ENOSPC and others > EIO > EDOM > EPIPE
        if (op_data->errcode == 0 ||
            retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
            retval == -EDOM && (op_data->errcode == -EPIPE) ||
            retval != -EIO && retval != -EDOM && retval != -EPIPE)
        {
            op_data->errcode = retval;
        }
        op_data->errors++;
        // Keep the connection for "expected" failures: checksum mismatch
        // (EDOM), version mismatch (ERANGE), out-of-space writes and
        // read EIO (handled by replica retry)
        if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
            (retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
            (retval != -EIO || opcode != OSD_OP_SEC_READ))
        {
            // Drop connection on unexpected errors
            msgr.stop_client(subop->peer_fd);
        }
    }
    else
    {
        subop->rmw_buf = NULL;
        op_data->done++;
    }
    // Resume the parent op once every subop has reported back
    if ((op_data->errors + op_data->done) >= op_data->n_subops)
    {
        delete[] op_data->subops;
        op_data->subops = NULL;
        op_data->st++;
        if (cur_op->req.hdr.opcode == OSD_OP_READ)
        {
            continue_primary_read(cur_op);
        }
        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
        {
            continue_primary_write(cur_op);
        }
        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
        {
            continue_primary_sync(cur_op);
        }
        else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
        {
            continue_primary_del(cur_op);
        }
        else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
        {
            continue_primary_scrub(cur_op);
        }
        else
        {
            throw std::runtime_error("BUG: unknown opcode");
        }
    }
}
2020-06-01 01:53:32 +03:00
void osd_t : : cancel_primary_write ( osd_op_t * cur_op )
{
if ( cur_op - > op_data & & cur_op - > op_data - > subops )
{
// Primary-write operation is waiting for subops, subops
// are sent to peer OSDs, so we can't just throw them away.
// Mark them with an extra EPIPE.
cur_op - > op_data - > errors + + ;
2022-12-30 02:03:22 +03:00
if ( cur_op - > op_data - > errcode = = 0 )
cur_op - > op_data - > errcode = - EPIPE ;
2020-06-01 01:53:32 +03:00
cur_op - > op_data - > done - - ; // Caution: `done` must be signed because may become -1 here
}
else
{
finish_op ( cur_op , - EPIPE ) ;
}
}
2021-03-15 02:26:39 +03:00
// Linear membership test: returns true if <osd_num> occurs among the
// first <size> entries of <osd_set>.
bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
{
    const osd_num_t *end = osd_set + size;
    for (const osd_num_t *cur = osd_set; cur != end; cur++)
    {
        if (*cur == osd_num)
        {
            return true;
        }
    }
    return false;
}
2020-09-05 17:24:27 +03:00
// Collect object copies from <loc_set> that are NOT part of the current
// target set <cur_set> (extra/leftover chunks) and submit delete subops
// for them via submit_primary_del_batch().
void osd_t::submit_primary_del_subops(osd_op_t *cur_op, osd_num_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
    // NOTE: variable-length array - a GCC/Clang extension, not standard C++
    obj_ver_osd_t extra_chunks[loc_set.size()];
    int chunks_to_del = 0;
    for (auto & chunk: loc_set)
    {
        // ordered comparison for EC/XOR, unordered for replicated pools
        // (cur_set == NULL means "delete all copies")
        if (!cur_set || (rep
            ? !contains_osd(cur_set, set_size, chunk.osd_num)
            : (chunk.osd_num != cur_set[chunk.role])))
        {
            extra_chunks[chunks_to_del++] = (obj_ver_osd_t){
                .osd_num = chunk.osd_num,
                .oid = {
                    .inode = op_data->oid.inode,
                    // Replicated pools store everything as stripe 0
                    .stripe = op_data->oid.stripe | (rep ? 0 : chunk.role),
                },
                // Same version as write
                .version = op_data->fact_ver,
            };
        }
    }
    submit_primary_del_batch(cur_op, extra_chunks, chunks_to_del);
}
// Submit delete subops for <chunks_to_delete_count> object chunks:
// local chunks go to the blockstore, remote chunks become
// OSD_OP_SEC_DELETE requests. Returns immediately if the batch is empty.
void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    op_data->n_subops = chunks_to_delete_count;
    op_data->done = op_data->errors = op_data->errcode = 0;
    if (op_data->n_subops <= 0)
    {
        // Nothing to delete - no subops array is allocated
        return;
    }
    osd_op_t *subops = new osd_op_t[chunks_to_delete_count];
    op_data->subops = subops;
    for (int i = 0; i < chunks_to_delete_count; i++)
    {
        auto & chunk = chunks_to_delete[i];
        if (chunk.osd_num == this->osd_num)
        {
            // Local chunk: delete through the blockstore.
            // op_type doubles as a back-pointer to the parent op.
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
            subops[i].bs_op = new blockstore_op_t({
                .opcode = BS_OP_DELETE,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
                    handle_primary_bs_subop(subop);
                },
                { {
                    .oid = chunk.oid,
                    .version = chunk.version,
                } },
            });
            bs->enqueue_op(subops[i].bs_op);
        }
        else
        {
            // Remote chunk: send a secondary delete request
            subops[i].op_type = OSD_OP_OUT;
            subops[i].req = (osd_any_op_t){ .sec_del = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_DELETE,
                },
                .oid = chunk.oid,
                .version = chunk.version,
            } };
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
                handle_primary_subop(subop, cur_op);
            };
            auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
            if (peer_fd_it != msgr.osd_peer_fds.end())
            {
                subops[i].peer_fd = peer_fd_it->second;
                msgr.outbox_push(&subops[i]);
            }
            else
            {
                // Fail it immediately
                // (deferred to the event loop to avoid re-entrancy)
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
            }
        }
    }
}
2020-03-22 02:13:10 +03:00
// Submit SYNC subops to every OSD in op_data->dirty_osds.
// Returns 0 if all subops completed immediately (nothing was actually
// submitted, e.g. all peers are disconnected) and 1 if the caller must
// wait for subop completions.
int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    int n_osds = op_data->dirty_osd_count;
    osd_op_t *subops = new osd_op_t[n_osds];
    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_osds;
    op_data->subops = subops;
    std::map<uint64_t, int>::iterator peer_it;
    for (int i = 0; i < n_osds; i++)
    {
        osd_num_t sync_osd = op_data->dirty_osds[i];
        if (sync_osd == this->osd_num)
        {
            // Local sync through the blockstore.
            // op_type doubles as a back-pointer to the parent op.
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
            subops[i].bs_op = new blockstore_op_t({
                .opcode = BS_OP_SYNC,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
                    handle_primary_bs_subop(subop);
                },
            });
            bs->enqueue_op(subops[i].bs_op);
        }
        else if ((peer_it = msgr.osd_peer_fds.find(sync_osd)) != msgr.osd_peer_fds.end())
        {
            // Connected peer: send a secondary sync request
            subops[i].op_type = OSD_OP_OUT;
            subops[i].peer_fd = peer_it->second;
            subops[i].req = (osd_any_op_t){ .sec_sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_SYNC,
                },
            } };
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
                handle_primary_subop(subop, cur_op);
            };
            msgr.outbox_push(&subops[i]);
        }
        else
        {
            // Disconnected peer - nothing to sync there, count as done
            op_data->done++;
        }
    }
    if (op_data->done >= op_data->n_subops)
    {
        // Every subop was a no-op - free the array and report completion
        delete[] op_data->subops;
        op_data->subops = NULL;
        return 0;
    }
    return 1;
}
// Submit STABILIZE subops: for every OSD in op_data->unstable_write_osds,
// ask it to stabilize its slice of op_data->unstable_writes
// (each slice is stab_osd.len obj_ver_id entries starting at stab_osd.start).
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    int n_osds = op_data->unstable_write_osds->size();
    osd_op_t *subops = new osd_op_t[n_osds];
    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_osds;
    op_data->subops = subops;
    for (int i = 0; i < n_osds; i++)
    {
        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
        if (stab_osd.osd_num == this->osd_num)
        {
            // Local stabilize through the blockstore.
            // op_type doubles as a back-pointer to the parent op.
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
            subops[i].bs_op = new blockstore_op_t((blockstore_op_t){
                .opcode = BS_OP_STABLE,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
                    handle_primary_bs_subop(subop);
                },
                {
                    // For BS_OP_STABLE len is the number of obj_ver_id entries
                    .len = (uint32_t)stab_osd.len,
                },
                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
            });
            bs->enqueue_op(subops[i].bs_op);
        }
        else
        {
            // Remote stabilize: payload is the obj_ver_id list itself
            subops[i].op_type = OSD_OP_OUT;
            subops[i].req = (osd_any_op_t){ .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_STABILIZE,
                },
                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
            } };
            subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
                handle_primary_subop(subop, cur_op);
            };
            auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
            if (peer_fd_it != msgr.osd_peer_fds.end())
            {
                subops[i].peer_fd = peer_fd_it->second;
                msgr.outbox_push(&subops[i]);
            }
            else
            {
                // Fail it immediately
                // (deferred to the event loop to avoid re-entrancy)
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
            }
        }
    }
}
2020-06-01 01:53:32 +03:00
// Cancel all queued writes for object <oid> in <pg> with <retval>,
// starting from <first_op> which must be the head of the object's queue.
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
    auto st_it = pg.write_queue.find(oid), it = st_it;
    if (it == pg.write_queue.end() || it->second != first_op)
    {
        // Write queue doesn't match the first operation.
        // first_op is a leftover operation from the previous peering of the same PG.
        finish_op(first_op, retval);
        return;
    }
    // Collect every queued write for this object
    std::vector<osd_op_t*> cancel_ops;
    while (it != pg.write_queue.end() && it->first == oid)
    {
        cancel_ops.push_back(it->second);
        it++;
    }
    if (st_it != it)
    {
        // First erase them and then run finish_op() for the sake of reenterability
        // Calling finish_op() on a live iterator previously triggered a bug where some
        // of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
        pg.write_queue.erase(st_it, it);
        for (auto op: cancel_ops)
        {
            finish_op(op, retval);
        }
    }
}