diff --git a/src/blockstore.h b/src/blockstore.h index 792f9835..3bc3a506 100644 --- a/src/blockstore.h +++ b/src/blockstore.h @@ -73,7 +73,10 @@ Input: write request is copied into the metadata area bitwise and stored there. Output: -- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) +- retval = number of bytes actually read/written or negative error number + -EINVAL = invalid input parameters + -ENOENT = requested object/version does not exist for reads + -ENOSPC = no space left in the store for writes - version = the version actually read or written ## BS_OP_DELETE diff --git a/src/blockstore_read.cpp b/src/blockstore_read.cpp index cd2f64e4..3d818517 100644 --- a/src/blockstore_read.cpp +++ b/src/blockstore_read.cpp @@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid); if (!clean_found && !dirty_found) { - // region is not allocated - return zeroes - memset(read_op->buf, 0, read_op->len); read_op->version = 0; - read_op->retval = read_op->len; + read_op->retval = -ENOENT; FINISH_OP(read_op); return 2; } @@ -149,8 +147,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) if (IS_DELETE(dirty.state)) { assert(!result_version); - clean_found = false; - break; + read_op->version = 0; + read_op->retval = -ENOENT; + FINISH_OP(read_op); + return 2; } if (!result_version) { @@ -238,12 +238,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) } } } - else if (fulfilled < read_op->len) + if (!result_version) { - // fill remaining parts with zeroes - assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0)); + // May happen if there are entries in dirty_db but all of them are !version_ok + read_op->version = 0; + read_op->retval = -ENOENT; + FINISH_OP(read_op); + return 2; + } + if (fulfilled < read_op->len) + { + 
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0)); + assert(fulfilled == read_op->len); } - assert(fulfilled == read_op->len); read_op->version = result_version; if (!PRIV(read_op)->pending_ops) { diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 6f24a016..53f9050a 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -329,21 +329,40 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os } // Mark object chunk(s) as corrupted int changes = 0; - for (auto & chunk: corrupted_set) + for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); ) { - bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error; - if (corrupted) + auto & chunk = *chunk_it; + if (stripes[chunk.role].osd_num == chunk.osd_num) { - if (!(chunk.loc_bad & LOC_CORRUPTED)) + if (stripes[chunk.role].not_exists) + { changes++; - chunk.loc_bad |= LOC_CORRUPTED; + chunk_it = corrupted_set.erase(chunk_it); /* erase() invalidates chunk_it; resume from the iterator it returns */ + continue; + } + if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED) + { + changes++; + chunk.loc_bad = LOC_CORRUPTED; + } + else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing && + (chunk.loc_bad & LOC_CORRUPTED)) + { + changes++; + chunk.loc_bad &= ~LOC_CORRUPTED; + } } - else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED)) + if (inconsistent && !chunk.loc_bad) { - if (!(chunk.loc_bad & LOC_INCONSISTENT)) - changes++; + changes++; chunk.loc_bad |= LOC_INCONSISTENT; } + else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT)) + { + changes++; + chunk.loc_bad &= ~LOC_INCONSISTENT; + } + chunk_it++; } if (!changes) { diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 255944f1..e79f3f34 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -344,6 +344,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op) else expected = 0; 
osd_primary_op_data_t *op_data = cur_op->op_data; + if (retval == -ENOENT && opcode == OSD_OP_SEC_READ) + { + // ENOENT is not an error for almost all reads, except scrub + retval = expected; + memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected); + ((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true; + } if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)) { uint64_t version = subop->reply.sec_rw.version; diff --git a/src/osd_rmw.cpp b/src/osd_rmw.cpp index eb168280..f3b2ccd3 100644 --- a/src/osd_rmw.cpp +++ b/src/osd_rmw.cpp @@ -1131,12 +1131,14 @@ std::vector ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce) { std::vector found_valid; - int cur_live[pg_size], live_count = 0; + int cur_live[pg_size], live_count = 0, exists_count = 0; osd_num_t fake_osd_set[pg_size]; for (int role = 0; role < pg_size; role++) { if (!stripes[role].missing) { + if (!stripes[role].not_exists) + exists_count++; cur_live[live_count++] = role; fake_osd_set[role] = role+1; } @@ -1145,6 +1147,14 @@ std::vector ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min { return std::vector(); } + if (exists_count <= pg_minsize) + { + // Special case: user manually deleted some chunks + for (int role = 0; role < pg_size; role++) + if (!stripes[role].missing && !stripes[role].not_exists) + found_valid.push_back(role); + return found_valid; + } // Try to locate errors using brute force if there isn't too many combinations osd_rmw_stripe_t brute_stripes[pg_size]; int out_count = live_count-pg_minsize; diff --git a/src/osd_rmw.h b/src/osd_rmw.h index f09e663e..60255b2b 100644 --- a/src/osd_rmw.h +++ b/src/osd_rmw.h @@ -30,6 +30,7 @@ struct osd_rmw_stripe_t osd_num_t osd_num; bool missing: 1; bool read_error: 1; + bool not_exists: 1; }; // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the 
PG to operate diff --git a/src/osd_scrub.cpp b/src/osd_scrub.cpp index f673e76b..33932de7 100644 --- a/src/osd_scrub.cpp +++ b/src/osd_scrub.cpp @@ -455,7 +455,8 @@ resume_2: for (int role = 0; role < op_data->pg_size; role++) { eq_to[role] = -1; - if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing) + if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing && + !op_data->stripes[role].not_exists) { total++; eq_to[role] = role; @@ -560,7 +561,9 @@ resume_2: } for (int role = 0; role < op_data->pg_size; role++) { - if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent) + if (op_data->stripes[role].osd_num != 0 && + (op_data->stripes[role].read_error || op_data->stripes[role].not_exists) || + inconsistent) { // Got at least 1 read error or mismatch, mark the object as corrupted auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });