diff --git a/src/osd.h b/src/osd.h index 59f1db10..f14cc8b1 100644 --- a/src/osd.h +++ b/src/osd.h @@ -240,7 +240,8 @@ class osd_t void continue_primary_sync(osd_op_t *cur_op); void continue_primary_del(osd_op_t *cur_op); bool check_write_queue(osd_op_t *cur_op, pg_t & pg); - void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg); + void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true); + pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref); void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref); bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); @@ -256,10 +257,11 @@ class osd_t int submit_primary_sync_subops(osd_op_t *cur_op); void submit_primary_stab_subops(osd_op_t *cur_op); - uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state); + uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state); void continue_chained_read(osd_op_t *cur_op); int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op); + void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op); void send_chained_read_results(pg_t & pg, osd_op_t *cur_op); std::vector collect_chained_read_requests(osd_op_t *cur_op); int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector & bitmap_requests); diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index d4317760..79a9bf26 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -310,20 +310,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) if (osd_op->reply.hdr.retval < 0) { // Error recovering object - if (osd_op->reply.hdr.retval == -EPIPE) - { - // PG is stopped or one of the OSDs is gone, error is harmless - printf( - "[PG %u/%u] Recovery operation failed with object %lx:%lx\n", - INODE_POOL(op->oid.inode), - map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size), - op->oid.inode, op->oid.stripe - ); - } - else - { - throw std::runtime_error("Failed to recover an object"); - } + // EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not + printf( + "[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n", + INODE_POOL(op->oid.inode), + map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size), + op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval + ); + } + else if (log_level > 2) + { + printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe); } // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase() op->osd_op = NULL; diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 489b6a66..bbdfec2e 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_ { for (auto & o: osd_set) { - if (!(o.loc_bad & LOC_OUTDATED)) + if (!o.loc_bad) { read_target.push_back(o.osd_num); } diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 80fe9c99..06f964e4 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -90,6 +90,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) chain_size * ( // - copy of the chain sizeof(inode_t) + + // - object states for every chain item + sizeof(void*) + // - bitmap buffers for chained read stripe_count * clean_entry_bitmap_size + // - 'missing' flags for chained reads @@ -117,6 +119,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) { op_data->read_chain = (inode_t*)data_buf; data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size; + op_data->chain_states = (pg_osd_set_state_t**)data_buf; + data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size; op_data->snapshot_bitmaps = data_buf; data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size; op_data->missing_flags = (uint8_t*)data_buf; @@ -131,6 +135,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) inode_it->second.parent_id != cur_op->req.rw.inode) { op_data->read_chain[chain_num++] = inode_it->second.parent_id; + op_data->chain_states[chain_num++] = NULL; inode_it = st_cli.inode_config.find(inode_it->second.parent_id); } } @@ -138,12 +143,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) return true; } -uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state) +uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state) { if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED))) { *object_state = NULL; - return def; + return pg.cur_set.data(); } auto st_it = pg.incomplete_objects.find(oid); if (st_it != pg.incomplete_objects.end()) @@ -164,7 +169,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_ return st_it->second->read_target.data(); } *object_state = NULL; - return def; + return pg.cur_set.data(); } void osd_t::continue_primary_read(osd_op_t *cur_op) @@ -183,6 +188,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) goto resume_1; else if (op_data->st == 2) goto resume_2; +resume_0: cur_op->reply.rw.bitmap_len = 0; { auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); @@ -206,15 +212,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) // Determine version auto vo_it = pg.ver_override.find(op_data->oid); op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX; - op_data->prev_set = pg.cur_set.data(); - if (pg.state != PG_ACTIVE) - { - // PG may be degraded or have misplaced objects - op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); - } + // PG may have degraded or misplaced objects + op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state); if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) { // Fast happy-path + if (op_data->scheme == POOL_SCHEME_REPLICATED && + op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE)) + { + finish_op(cur_op, -EIO); + return; + } cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0); submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op); op_data->st = 1; @@ -240,6 +248,14 @@ resume_1: resume_2: if (op_data->errors > 0) { + if (op_data->errcode == -EIO || op_data->errcode == -EDOM) + { + // I/O or checksum error + auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); + // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); + goto resume_0; + } finish_op(cur_op, op_data->errcode); return; } @@ -278,15 +294,129 @@ resume_2: finish_op(cur_op, cur_op->req.rw.len); } +pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref) +{ + pg_osd_set_state_t *object_state = NULL; + get_object_osd_set(pg, oid, &object_state); + if (prev_object_state != object_state) + { + // Object state changed in between by a parallel I/O operation, skip marking as failed + if (ref) + { + deref_object_state(pg, &prev_object_state, ref); + if (object_state) + object_state->ref_count++; + } + return object_state; + } + pg_osd_set_t corrupted_set; + if (object_state) + { + corrupted_set = object_state->osd_set; + } + else + { + for (int i = 0; i < pg.cur_set.size(); i++) + { + corrupted_set.push_back((pg_obj_loc_t){ + .role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i), + .osd_num = pg.cur_set[i], + }); + } + } + // Mark object chunk(s) as corrupted + uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0; + for (auto & chunk: corrupted_set) + { + bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error; + if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED)) + n_corrupted++; + chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0); + if (!chunk.loc_bad) + { + if (pg.scheme == POOL_SCHEME_REPLICATED) + n_roles = 1; + else if (!(has_roles & (1 << chunk.role))) + { + n_roles++; + has_roles |= (1 << chunk.role); + } + n_copies++; + } + } + if (!n_corrupted) + { + // No chunks newly marked as corrupted - object is already marked or moved + return object_state; + } + int old_pg_state = pg.state; + if (object_state) + { + remove_object_from_state(oid, &object_state, pg, false); + deref_object_state(pg, &object_state, ref); + } + // Calculate object state + uint64_t obj_state = OBJ_CORRUPTED; + int pg_state_bits = PG_HAS_CORRUPTED; + this->corrupted_objects++; + pg.corrupted_count++; + if (log_level > 1) + { + printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n", + oid.inode, oid.stripe, n_roles, n_copies, n_corrupted); + } + if (n_roles < pg.pg_data_size) + { + this->incomplete_objects++; + obj_state |= OBJ_INCOMPLETE; + pg_state_bits = PG_HAS_INCOMPLETE; + } + else if (n_roles < pg.pg_cursize) + { + this->degraded_objects++; + obj_state |= OBJ_DEGRADED; + pg_state_bits = PG_HAS_DEGRADED; + } + else + { + this->misplaced_objects++; + obj_state |= OBJ_MISPLACED; + pg_state_bits = PG_HAS_MISPLACED; + } + pg.state |= pg_state_bits; + if (pg.state != old_pg_state) + { + report_pg_state(pg); + if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) != + (old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))) + { + peering_state = peering_state | OSD_RECOVERING; + if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED)) + { + // Restart recovery from degraded objects + recovery_last_degraded = true; + recovery_last_pg = {}; + recovery_last_oid = {}; + } + ringloop->wakeup(); + } + } + // Insert object into the new state and retry + object_state = pg.add_object_to_state(oid, obj_state, corrupted_set); + if (ref) + object_state->ref_count++; + return object_state; +} + // Decrement pg_osd_set_state_t's object_count and change PG state accordingly -void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg) +void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report) { if (!*object_state) { return; } pg_osd_set_state_t *recheck_state = NULL; - get_object_osd_set(pg, oid, NULL, &recheck_state); + get_object_osd_set(pg, oid, &recheck_state); if (recheck_state != *object_state) { recheck_state->ref_count++; @@ -295,6 +425,12 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec return; } (*object_state)->object_count--; + if ((*object_state)->state & OBJ_CORRUPTED) + { + this->corrupted_objects--; + pg.corrupted_count--; + } + bool changed = false; if ((*object_state)->state & OBJ_INCOMPLETE) { // Successful write means that object is not incomplete anymore @@ -303,7 +439,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec if (!pg.incomplete_objects.size()) { pg.state = pg.state & ~PG_HAS_INCOMPLETE; - report_pg_state(pg); + changed = true; } } else if ((*object_state)->state & OBJ_DEGRADED) @@ -313,7 +449,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec if (!pg.degraded_objects.size()) { pg.state = pg.state & ~PG_HAS_DEGRADED; - report_pg_state(pg); + changed = true; } } else if ((*object_state)->state & OBJ_MISPLACED) @@ -323,13 +459,17 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec if (!pg.misplaced_objects.size()) { pg.state = pg.state & ~PG_HAS_MISPLACED; - report_pg_state(pg); + changed = true; } } else { throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state)); } + if (changed && report) + { + report_pg_state(pg); + } } void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref) @@ -374,13 +514,14 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) } resume_1: // Determine which OSDs contain this object and delete it - op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); + op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state); if (op_data->object_state) { op_data->object_state->ref_count++; } // Submit 1 read to determine the actual version number submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); + op_data->prev_set = NULL; resume_2: op_data->st = 2; return; diff --git a/src/osd_primary.h b/src/osd_primary.h index ce667468..b0270437 100644 --- a/src/osd_primary.h +++ b/src/osd_primary.h @@ -50,6 +50,7 @@ struct osd_primary_op_data_t // for read_bitmaps void *snapshot_bitmaps; inode_t *read_chain; + pg_osd_set_state_t **chain_states; uint8_t *missing_flags; int chain_size; osd_chain_read_t *chain_reads; diff --git a/src/osd_primary_chain.cpp b/src/osd_primary_chain.cpp index 01fc39ed..5713c0ca 100644 --- a/src/osd_primary_chain.cpp +++ b/src/osd_primary_chain.cpp @@ -40,10 +40,24 @@ resume_3: resume_4: if (op_data->errors > 0) { - free(op_data->chain_reads); - op_data->chain_reads = NULL; - finish_op(cur_op, op_data->errcode); - return; + if (op_data->errcode == -EIO || op_data->errcode == -EDOM) + { + // Handle corrupted reads and retry... + check_corrupted_chained(pg, cur_op); + free(cur_op->buf); + cur_op->buf = NULL; + free(op_data->chain_reads); + op_data->chain_reads = NULL; + // FIXME: We can in theory retry only specific parts instead of the whole operation + goto resume_1; + } + else + { + free(op_data->chain_reads); + op_data->chain_reads = NULL; + finish_op(cur_op, op_data->errcode); + return; + } } send_chained_read_results(pg, cur_op); finish_op(cur_op, cur_op->req.rw.len); @@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vectorread_chain[chain_num], .stripe = op_data->oid.stripe }; auto vo_it = pg.ver_override.find(cur_oid); uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX; - pg_osd_set_state_t *object_state; - uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); + uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]); if (pg.scheme == POOL_SCHEME_REPLICATED) { osd_num_t read_target = 0; @@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg) osd_op_t *subop = op_data->subops+subop_idx; subop->op_type = OSD_OP_OUT; // FIXME: Use the pre-allocated buffer + assert(!subop->buf); subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev)); subop->req = (osd_any_op_t){ .sec_read_bmp = { @@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) op_data->chain_read_count = chain_reads.size(); op_data->chain_reads = (osd_chain_read_t*)calloc_or_die( 1, sizeof(osd_chain_read_t) * chain_reads.size() + // FIXME: Allocate only instead of stripes + // (but it's slightly harder to handle in send_chained_read_results()) + sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size ); osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)( @@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) uint64_t *cur_set = pg.cur_set.data(); if (pg.state != PG_ACTIVE) { - pg_osd_set_state_t *object_state; - cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); + cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]); if (op_data->scheme != POOL_SCHEME_REPLICATED) { if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0) @@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) } op_data->degraded = 1; } + else + { + auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos]; + if (cur_state && (cur_state->state & OBJ_INCOMPLETE)) + { + free(op_data->chain_reads); + op_data->chain_reads = NULL; + finish_op(cur_op, -EIO); + return -1; + } + } } if (op_data->scheme == POOL_SCHEME_REPLICATED) { @@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) } } } + assert(!cur_op->buf); cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size); void *cur_buf = cur_op->buf; for (int cri = 0; cri < chain_reads.size(); cri++) @@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe }; auto vo_it = pg.ver_override.find(cur_oid); uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX; - uint64_t *cur_set = pg.cur_set.data(); - if (pg.state != PG_ACTIVE) - { - pg_osd_set_state_t *object_state; - cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); - } + auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos]; + uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data()); int zero_read = -1; if (op_data->scheme == POOL_SCHEME_REPLICATED) { @@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op) return 0; } +void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op) +{ + osd_primary_op_data_t *op_data = cur_op->op_data; + int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); + osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)( + (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count + ); + for (int cri = 0; cri < op_data->chain_read_count; cri++) + { + object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe }; + osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count; + bool corrupted = false; + for (int i = 0; i < stripe_count; i++) + { + if (stripes[i].read_error) + { + corrupted = true; + break; + } + } + if (corrupted) + { + mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false); + } + } +} + void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op) { osd_primary_op_data_t *op_data = cur_op->op_data; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index da8520a0..31c34dcb 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -147,9 +147,9 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o continue; } osd_num_t role_osd_num = osd_set[role]; + int stripe_num = rep ? 0 : role; if (role_osd_num != 0) { - int stripe_num = rep ? 0 : role; osd_op_t *subop = op_data->subops + i; uint32_t subop_len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start @@ -158,12 +158,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o { subop_len = 0; } + stripes[stripe_num].osd_num = role_osd_num; + stripes[stripe_num].read_error = false; + subop->bitmap = stripes[stripe_num].bmp_buf; + subop->bitmap_len = clean_entry_bitmap_size; + // Using rmw_buf to pass pointer to stripes. Dirty but should work + subop->rmw_buf = stripes+stripe_num; if (role_osd_num == this->osd_num) { clock_gettime(CLOCK_REALTIME, &subop->tv_begin); subop->op_type = (uint64_t)cur_op; - subop->bitmap = stripes[stripe_num].bmp_buf; - subop->bitmap_len = clean_entry_bitmap_size; subop->bs_op = new blockstore_op_t({ .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ), .callback = [subop, this](blockstore_op_t *bs_subop) @@ -192,8 +196,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o else { subop->op_type = OSD_OP_OUT; - subop->bitmap = stripes[stripe_num].bmp_buf; - subop->bitmap_len = clean_entry_bitmap_size; subop->req.sec_rw = { .header = { .magic = SECONDARY_OSD_OP_MAGIC, @@ -250,6 +252,10 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o } i++; } + else + { + stripes[stripe_num].osd_num = 0; + } } return i-subop_idx; } @@ -339,9 +345,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op) if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE) { printf( - "%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n", + subop->peer_fd >= 0 + ? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n" + : "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version, - subop->peer_fd, retval, expected + retval, expected, subop->peer_fd ); } else @@ -351,22 +359,32 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op) osd_op_names[opcode], subop->peer_fd, retval, expected ); } - // Error priority: EIO > ENOSPC > EPIPE - if (op_data->errcode == 0 || retval == -EIO || + if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM)) + { + // We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted + ((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true; + } + subop->rmw_buf = NULL; + // Error priority: EIO > EDOM > ENOSPC > EPIPE + if (op_data->errcode == 0 || + retval == -EIO || + retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) || retval == -ENOSPC && op_data->errcode == -EPIPE) { op_data->errcode = retval; } op_data->errors++; - if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE || - retval != -ENOSPC)) + if (subop->peer_fd >= 0 && retval != -EDOM && + (retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) && + (retval != -EIO || opcode != OSD_OP_SEC_READ)) { - // Drop connection on any error expect ENOSPC + // Drop connection on unexpected errors msgr.stop_client(subop->peer_fd); } } else { + subop->rmw_buf = NULL; op_data->done++; if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE) { diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index 222fd2b7..5ef322f4 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -58,12 +58,13 @@ resume_1: // Determine blocks to read and write // Missing chunks are allowed to be overwritten even in incomplete objects // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact - op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); + op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state); if (op_data->object_state) { // Protect object_state from being freed by a parallel read operation changing it op_data->object_state->ref_count++; } +retry_1: if (op_data->scheme == POOL_SCHEME_REPLICATED) { // Simplified algorithm @@ -73,6 +74,12 @@ resume_1: if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 || op_data->stripes[0].write_end != bs_block_size)) { + if (op_data->object_state->state & OBJ_INCOMPLETE) + { + // Refuse partial overwrite of an incomplete (corrupted) object + cur_op->reply.hdr.retval = -EIO; + goto continue_others; + } // Object is degraded/misplaced and will be moved to op_data->stripes[0].read_start = 0; op_data->stripes[0].read_end = bs_block_size; @@ -91,13 +98,53 @@ resume_1: } } // Read required blocks - submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); + { + if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE)) + { + // Allow to read version number (just version number!) from corrupted chunks + // to allow full overwrite of a corrupted object + bool found = false; + for (int role = 0; role < op_data->pg_size; role++) + { + if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start) + { + found = true; + break; + } + } + if (!found) + { + osd_num_t corrupted_target[op_data->pg_size]; + for (int role = 0; role < op_data->pg_size; role++) + { + corrupted_target[role] = 0; + } + for (auto & loc: op_data->object_state->osd_set) + { + if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role]) + { + corrupted_target[loc.role] = loc.osd_num; + } + } + submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op); + goto resume_2; + } + } + submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); + } resume_2: op_data->st = 2; return; resume_3: if (op_data->errors > 0) { + if (op_data->errcode == -EIO || op_data->errcode == -EDOM) + { + // Mark object corrupted and retry + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true); + op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data(); + goto retry_1; + } deref_object_state(pg, &op_data->object_state, true); pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode); return; diff --git a/src/osd_rmw.h b/src/osd_rmw.h index 672584c8..42c257fc 100644 --- a/src/osd_rmw.h +++ b/src/osd_rmw.h @@ -26,7 +26,9 @@ struct osd_rmw_stripe_t // read_end=UINT32_MAX means to only read bitmap, but not data uint32_t read_start, read_end; uint32_t write_start, write_end; - bool missing; + osd_num_t osd_num; + bool missing: 1; + bool read_error: 1; }; // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate