From fe1ee67b05881ace8ff3075b6390c951461ec4ea Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 15 Feb 2023 00:55:40 +0300 Subject: [PATCH] Add min/max stripe and limit to OP_LIST --- src/blockstore.h | 38 ++++++++++++++++----- src/blockstore_impl.cpp | 67 +++++++++++++++++++++++--------------- src/osd.cpp | 7 ++-- src/osd_flush.cpp | 4 ++- src/osd_ops.h | 5 +++ src/osd_peering.cpp | 11 ++++--- src/osd_primary_subops.cpp | 20 +++++++----- src/osd_secondary.cpp | 17 +++++++--- 8 files changed, 112 insertions(+), 57 deletions(-) diff --git a/src/blockstore.h b/src/blockstore.h index 9d3d334e..87ccbadf 100644 --- a/src/blockstore.h +++ b/src/blockstore.h @@ -122,11 +122,14 @@ Output: Get a list of all objects in this Blockstore. Input: -- oid.stripe = PG alignment -- len = PG count or 0 to list all objects -- offset = PG number -- oid.inode = min inode number or 0 to list all inodes -- version = max inode number or 0 to list all inodes +- pg_alignment = PG alignment +- pg_count = PG count or 0 to list all objects +- pg_number = PG number +- list_stable_limit = max number of clean objects in the reply + it's guaranteed that dirty objects are returned from the same interval, + i.e. from (min_oid .. min(max_oid, max(returned stable OIDs))) +- min_oid = min inode/stripe or 0 to list all objects +- max_oid = max inode/stripe or 0 to list all objects Output: - retval = total obj_ver_id count @@ -143,10 +146,27 @@ struct blockstore_op_t uint64_t opcode; // finish callback std::function callback; - object_id oid; - uint64_t version; - uint32_t offset; - uint32_t len; + union + { + // R/W + struct + { + object_id oid; + uint64_t version; + uint32_t offset; + uint32_t len; + }; + // List + struct __attribute__((__packed__)) + { + object_id min_oid; + object_id max_oid; + uint32_t pg_alignment; + uint32_t pg_count; + uint32_t pg_number; + uint32_t list_stable_limit; + }; + }; void *buf; void *bitmap; int retval; diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index f26f1fdb..b288441d 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -445,11 +445,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint void blockstore_impl_t::process_list(blockstore_op_t *op) { - uint32_t list_pg = op->offset+1; - uint32_t pg_count = op->len; - uint64_t pg_stripe_size = op->oid.stripe; - uint64_t min_inode = op->oid.inode; - uint64_t max_inode = op->version; + uint32_t list_pg = op->pg_number+1; + uint32_t pg_count = op->pg_count; + uint64_t pg_stripe_size = op->pg_alignment; + uint64_t min_inode = op->min_oid.inode; + uint64_t max_inode = op->max_oid.inode; // Check PG if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count)) { @@ -496,7 +496,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) stable_alloc += clean_db.size(); } } - else + if (op->list_stable_limit > 0) + { + stable_alloc = op->list_stable_limit; + if (stable_alloc > 1024*1024) + stable_alloc = 1024*1024; + } + if (stable_alloc < 32768) { stable_alloc = 32768; } @@ -507,22 +513,21 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + auto max_oid = op->max_oid; + bool limited = false; for (auto shard_it = clean_db_shards.lower_bound(first_shard); shard_it != clean_db_shards.end() && shard_it->first <= last_shard; shard_it++) { auto & clean_db = shard_it->second; auto clean_it = clean_db.begin(), clean_end = clean_db.end(); - if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode) + if (op->min_oid.inode != 0 || op->min_oid.stripe != 0) { - clean_it = clean_db.lower_bound({ - .inode = min_inode, - .stripe = 0, - }); - clean_end = clean_db.upper_bound({ - .inode = max_inode, - .stripe = UINT64_MAX, - }); + clean_it = clean_db.lower_bound(op->min_oid); + } + if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid)) + { + clean_end = clean_db.upper_bound(max_oid); } for (; clean_it != clean_end; clean_it++) { @@ -541,11 +546,24 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) .oid = clean_it->first, .version = clean_it->second.version, }; + if (op->list_stable_limit > 0 && !limited && stable_count >= op->list_stable_limit) + { + limited = true; + break; + } + } + if (op->list_stable_limit > 0 && first_shard != last_shard) + { + // To maintain the order, we have to include objects in the same range from other shards + std::sort(stable, stable+stable_count); + if (stable_count > op->list_stable_limit) + stable_count = op->list_stable_limit; + max_oid = stable[stable_count-1].oid; } } - if (first_shard != last_shard) + if (op->list_stable_limit == 0 && first_shard != last_shard) { - // If that's not a per-PG listing, sort clean entries + // If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0) std::sort(stable, stable+stable_count); } int clean_stable_count = stable_count; @@ -554,20 +572,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) obj_ver_id *unstable = NULL; { auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end(); - if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode) + if (op->min_oid.inode != 0 || op->min_oid.stripe != 0) { dirty_it = dirty_db.lower_bound({ - .oid = { - .inode = min_inode, - .stripe = 0, - }, + .oid = op->min_oid, .version = 0, }); + } + if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid)) + { dirty_end = dirty_db.upper_bound({ - .oid = { - .inode = max_inode, - .stripe = UINT64_MAX, - }, + .oid = max_oid, .version = UINT64_MAX, }); } diff --git a/src/osd.cpp b/src/osd.cpp index da79ad3a..661f7466 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -444,10 +444,11 @@ void osd_t::print_slow() else if (op->req.hdr.opcode == OSD_OP_SEC_LIST) { bufprintf( - " inode=%lx-%lx pg=%u/%u, stripe=%lu", - op->req.sec_list.min_inode, op->req.sec_list.max_inode, + " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u", + op->req.sec_list.min_inode, op->req.sec_list.min_stripe, + op->req.sec_list.max_inode, op->req.sec_list.max_stripe, op->req.sec_list.list_pg, op->req.sec_list.pg_count, - op->req.sec_list.pg_stripe_size + op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit ); } else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE || diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index ed430b29..f91ffcf5 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -182,7 +182,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t op->bs_op = NULL; delete op; }, - .len = (uint32_t)count, + { + .len = (uint32_t)count, + }, .buf = op->buf, }); bs->enqueue_op(op->bs_op); diff --git a/src/osd_ops.h b/src/osd_ops.h index 9f232c75..9c59324a 100644 --- a/src/osd_ops.h +++ b/src/osd_ops.h @@ -173,6 +173,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t uint64_t pg_stripe_size; // inode range (used to select pools) uint64_t min_inode, max_inode; + // min/max oid stripe, added after inodes for backwards compatibility + // also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o + uint64_t min_stripe, max_stripe; + // max stable object count + uint32_t stable_limit; }; struct __attribute__((__packed__)) osd_reply_sec_list_t diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index edb18717..51c0f25a 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -321,11 +321,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps) clock_gettime(CLOCK_REALTIME, &op->tv_begin); op->bs_op = new blockstore_op_t(); op->bs_op->opcode = BS_OP_LIST; - op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size; - op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); - op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1; - op->bs_op->len = pg_counts[ps->pool_id]; - op->bs_op->offset = ps->pg_num-1; + op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size; + op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); + op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1; + op->bs_op->max_oid.stripe = UINT64_MAX; + op->bs_op->pg_count = pg_counts[ps->pool_id]; + op->bs_op->pg_number = ps->pg_num-1; op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op) { if (op->bs_op->retval < 0) diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 3d359105..3b8727b0 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -158,19 +158,21 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o { clock_gettime(CLOCK_REALTIME, &subop->tv_begin); subop->op_type = (uint64_t)cur_op; - subop->bs_op = new blockstore_op_t({ + subop->bs_op = new blockstore_op_t((blockstore_op_t){ .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ), .callback = [subop, this](blockstore_op_t *bs_subop) { handle_primary_bs_subop(subop); }, - .oid = { - .inode = inode, - .stripe = op_data->oid.stripe | stripe_num, + { + .oid = (object_id){ + .inode = inode, + .stripe = op_data->oid.stripe | stripe_num, + }, + .version = op_version, + .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, + .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, }, - .version = op_version, - .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, - .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf, .bitmap = stripes[stripe_num].bmp_buf, }); @@ -621,7 +623,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) { handle_primary_bs_subop(subop); }, - .len = (uint32_t)stab_osd.len, + { + .len = (uint32_t)stab_osd.len, + }, .buf = (void*)(op_data->unstable_writes + stab_osd.start), }); bs->enqueue_op(subops[i].bs_op); diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index 2fe77f69..2984bba5 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op) secondary_op_callback(cur_op); return; } - cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size; - cur_op->bs_op->len = cur_op->req.sec_list.pg_count; - cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1; - cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode; - cur_op->bs_op->version = cur_op->req.sec_list.max_inode; + cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size; + cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count; + cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1; + cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode; + cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe; + cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode; + if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX) + { + cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe + ? cur_op->req.sec_list.max_stripe : UINT64_MAX; + } + cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit; #ifdef OSD_STUB cur_op->bs_op->retval = 0; cur_op->bs_op->buf = NULL;