diff --git a/src/blockstore.h b/src/blockstore.h index 265372c5..792f9835 100644 --- a/src/blockstore.h +++ b/src/blockstore.h @@ -122,11 +122,14 @@ Output: Get a list of all objects in this Blockstore. Input: -- oid.stripe = PG alignment -- len = PG count or 0 to list all objects -- offset = PG number -- oid.inode = min inode number or 0 to list all inodes -- version = max inode number or 0 to list all inodes +- pg_alignment = PG alignment +- pg_count = PG count or 0 to list all objects +- pg_number = PG number +- list_stable_limit = max number of clean objects in the reply + it's guaranteed that dirty objects are returned from the same interval, + i.e. from (min_oid .. min(max_oid, max(returned stable OIDs))) +- min_oid = min inode/stripe or 0 to list all objects +- max_oid = max inode/stripe or 0 to list all objects Output: - retval = total obj_ver_id count @@ -143,10 +146,27 @@ struct blockstore_op_t uint64_t opcode; // finish callback std::function callback; - object_id oid; - uint64_t version; - uint32_t offset; - uint32_t len; + union + { + // R/W + struct + { + object_id oid; + uint64_t version; + uint32_t offset; + uint32_t len; + }; + // List + struct __attribute__((__packed__)) + { + object_id min_oid; + object_id max_oid; + uint32_t pg_alignment; + uint32_t pg_count; + uint32_t pg_number; + uint32_t list_stable_limit; + }; + }; void *buf; void *bitmap; int retval; diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index 253d5257..8d9b6751 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -462,11 +462,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint void blockstore_impl_t::process_list(blockstore_op_t *op) { - uint32_t list_pg = op->offset+1; - uint32_t pg_count = op->len; - uint64_t pg_stripe_size = op->oid.stripe; - uint64_t min_inode = op->oid.inode; - uint64_t max_inode = op->version; + uint32_t list_pg = op->pg_number+1; + uint32_t pg_count = op->pg_count; + uint64_t pg_stripe_size = op->pg_alignment; + uint64_t min_inode = op->min_oid.inode; + uint64_t max_inode = op->max_oid.inode; // Check PG if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count)) { @@ -513,7 +513,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) stable_alloc += clean_db.size(); } } - else + if (op->list_stable_limit > 0) + { + stable_alloc = op->list_stable_limit; + if (stable_alloc > 1024*1024) + stable_alloc = 1024*1024; + } + if (stable_alloc < 32768) { stable_alloc = 32768; } @@ -524,22 +530,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + auto max_oid = op->max_oid; + bool limited = false; + pool_pg_id_t last_shard_id = 0; for (auto shard_it = clean_db_shards.lower_bound(first_shard); shard_it != clean_db_shards.end() && shard_it->first <= last_shard; shard_it++) { auto & clean_db = shard_it->second; auto clean_it = clean_db.begin(), clean_end = clean_db.end(); - if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode) + if (op->min_oid.inode != 0 || op->min_oid.stripe != 0) { - clean_it = clean_db.lower_bound({ - .inode = min_inode, - .stripe = 0, - }); - clean_end = clean_db.upper_bound({ - .inode = max_inode, - .stripe = UINT64_MAX, - }); + clean_it = clean_db.lower_bound(op->min_oid); + } + if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid)) + { + clean_end = clean_db.upper_bound(max_oid); } for (; clean_it != clean_end; clean_it++) { @@ -558,11 +564,29 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) .oid = clean_it->first, .version = clean_it->second.version, }; + if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit) + { + if (!limited) + { + limited = true; + max_oid = stable[stable_count-1].oid; + } + break; + } } + if (op->list_stable_limit > 0) + { + // To maintain the order, we have to include objects in the same range from other shards + if (last_shard_id != 0 && last_shard_id != shard_it->first) + std::sort(stable, stable+stable_count); + if (stable_count > op->list_stable_limit) + stable_count = op->list_stable_limit; + } + last_shard_id = shard_it->first; } - if (first_shard != last_shard) + if (op->list_stable_limit == 0 && first_shard != last_shard) { - // If that's not a per-PG listing, sort clean entries + // If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0) std::sort(stable, stable+stable_count); } int clean_stable_count = stable_count; @@ -571,20 +595,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) obj_ver_id *unstable = NULL; { auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end(); - if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode) + if (op->min_oid.inode != 0 || op->min_oid.stripe != 0) { dirty_it = dirty_db.lower_bound({ - .oid = { - .inode = min_inode, - .stripe = 0, - }, + .oid = op->min_oid, .version = 0, }); + } + if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid)) + { dirty_end = dirty_db.upper_bound({ - .oid = { - .inode = max_inode, - .stripe = UINT64_MAX, - }, + .oid = max_oid, .version = UINT64_MAX, }); } @@ -628,6 +649,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) stable[stable_count++] = dirty_it->first; } } + if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit) + { + // Stop here + break; + } } else { diff --git a/src/osd.cpp b/src/osd.cpp index 06ac4d82..8ee87cd2 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -502,10 +502,11 @@ void osd_t::print_slow() else if (op->req.hdr.opcode == OSD_OP_SEC_LIST) { bufprintf( - " inode=%lx-%lx pg=%u/%u, stripe=%lu", - op->req.sec_list.min_inode, op->req.sec_list.max_inode, + " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u", + op->req.sec_list.min_inode, op->req.sec_list.min_stripe, + op->req.sec_list.max_inode, op->req.sec_list.max_stripe, op->req.sec_list.list_pg, op->req.sec_list.pg_count, - op->req.sec_list.pg_stripe_size + op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit ); } else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE || diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 79a9bf26..50fa3f86 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -192,7 +192,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t op->bs_op = NULL; delete op; }, - .len = (uint32_t)count, + { + .len = (uint32_t)count, + }, .buf = op->buf, }); bs->enqueue_op(op->bs_op); diff --git a/src/osd_ops.h b/src/osd_ops.h index 8055bbfc..edeeba7b 100644 --- a/src/osd_ops.h +++ b/src/osd_ops.h @@ -173,6 +173,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t uint64_t pg_stripe_size; // inode range (used to select pools) uint64_t min_inode, max_inode; + // min/max oid stripe, added after inodes for backwards compatibility + // also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o + uint64_t min_stripe, max_stripe; + // max stable object count + uint32_t stable_limit; }; struct __attribute__((__packed__)) osd_reply_sec_list_t diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 80e92b4c..55c4c218 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -321,11 +321,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps) clock_gettime(CLOCK_REALTIME, &op->tv_begin); op->bs_op = new blockstore_op_t(); op->bs_op->opcode = BS_OP_LIST; - op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size; - op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); - op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1; - op->bs_op->len = pg_counts[ps->pool_id]; - op->bs_op->offset = ps->pg_num-1; + op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size; + op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); + op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1; + op->bs_op->max_oid.stripe = UINT64_MAX; + op->bs_op->pg_count = pg_counts[ps->pool_id]; + op->bs_op->pg_number = ps->pg_num-1; op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op) { if (op->bs_op->retval < 0) diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 31c34dcb..e086ac25 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -168,19 +168,21 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o { clock_gettime(CLOCK_REALTIME, &subop->tv_begin); subop->op_type = (uint64_t)cur_op; - subop->bs_op = new blockstore_op_t({ + subop->bs_op = new blockstore_op_t((blockstore_op_t){ .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ), .callback = [subop, this](blockstore_op_t *bs_subop) { handle_primary_bs_subop(subop); }, - .oid = { - .inode = inode, - .stripe = op_data->oid.stripe | stripe_num, + { + .oid = (object_id){ + .inode = inode, + .stripe = op_data->oid.stripe | stripe_num, + }, + .version = op_version, + .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, + .len = subop_len, }, - .version = op_version, - .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, - .len = subop_len, .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf, .bitmap = stripes[stripe_num].bmp_buf, }); @@ -631,7 +633,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) { handle_primary_bs_subop(subop); }, - .len = (uint32_t)stab_osd.len, + { + .len = (uint32_t)stab_osd.len, + }, .buf = (void*)(op_data->unstable_writes + stab_osd.start), }); bs->enqueue_op(subops[i].bs_op); diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index 2fe77f69..2984bba5 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op) secondary_op_callback(cur_op); return; } - cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size; - cur_op->bs_op->len = cur_op->req.sec_list.pg_count; - cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1; - cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode; - cur_op->bs_op->version = cur_op->req.sec_list.max_inode; + cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size; + cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count; + cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1; + cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode; + cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe; + cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode; + if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX) + { + cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe + ? cur_op->req.sec_list.max_stripe : UINT64_MAX; + } + cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit; #ifdef OSD_STUB cur_op->bs_op->retval = 0; cur_op->bs_op->buf = NULL;