#pragma once #include #include #include #include #include #include #include #include #include #include #include #include "blockstore.h" #include "ringloop.h" #include "osd_ops.h" #include "sparsepp/sparsepp/spp.h" #define STRIPE_NUM(stripe) ((stripe) >> 4) #define STRIPE_REPLICA(stripe) ((stripe) & 0xf) #define OSD_OP_IN 0 #define OSD_OP_OUT 1 #define CL_READ_OP 1 #define CL_READ_DATA 2 #define CL_READ_REPLY_DATA 3 #define SQE_SENT 0x100l #define CL_WRITE_READY 1 #define CL_WRITE_REPLY 2 #define CL_WRITE_DATA 3 #define MAX_EPOLL_EVENTS 16 //#define OSD_STUB struct osd_op_t { int op_type; int peer_fd; union { osd_any_op_t op; uint8_t op_buf[OSD_PACKET_SIZE] = { 0 }; }; union { osd_any_reply_t reply; uint8_t reply_buf[OSD_PACKET_SIZE] = { 0 }; }; blockstore_op_t bs_op; void *buf = NULL; std::function callback; ~osd_op_t(); }; #define PEER_CONNECTING 1 #define PEER_CONNECTED 2 struct osd_client_t { sockaddr_in peer_addr; int peer_port; int peer_fd; int peer_state; std::function connect_callback; uint64_t osd_num = 0; //int in_flight_ops = 0; // Read state bool read_ready = false; bool reading = false; osd_op_t *read_op = NULL; int read_reply_id = 0; iovec read_iov; msghdr read_msg; void *read_buf = NULL; int read_remaining = 0; int read_state = 0; // Outbound operations sent to this client (which is probably an OSD peer) std::map sent_ops; // Outbound messages (replies or requests) std::deque outbox; // Write state osd_op_t *write_op = NULL; iovec write_iov; msghdr write_msg; void *write_buf = NULL; int write_remaining = 0; int write_state = 0; }; struct osd_obj_loc_t { uint64_t role; uint64_t osd_num; bool stable; }; inline bool operator < (const osd_obj_loc_t &a, const osd_obj_loc_t &b) { return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num; } struct osd_obj_state_t { std::vector loc; uint64_t state = 0; uint64_t object_count = 0; }; struct osd_ver_override_t { uint64_t max_ver; uint64_t target_ver; }; inline bool operator < (const osd_obj_state_t &a, const osd_obj_state_t &b) { return a.loc < b.loc; } namespace std { template<> struct hash { inline size_t operator()(const osd_obj_state_t &s) const { size_t seed = 0; for (int i = 0; i < s.loc.size(); i++) { // Copy-pasted from spp::hash_combine() seed ^= (s.loc[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); seed ^= (s.loc[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); seed ^= ((s.loc[i].stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); } return seed; } }; } // Placement group states // Exactly one of these: #define PG_OFFLINE (1<<0) #define PG_PEERING (1<<1) #define PG_INCOMPLETE (1<<2) #define PG_ACTIVE (1<<3) // Plus any of these: #define PG_HAS_UNFOUND (1<<4) #define PG_HAS_DEGRADED (1<<5) #define PG_HAS_MISPLACED (1<<6) // OSD object states #define OBJ_CLEAN 0x01 #define OBJ_MISPLACED 0x02 #define OBJ_DEGRADED 0x03 #define OBJ_INCOMPLETE 0x04 #define OBJ_NONSTABILIZED 0x10000 #define OBJ_UNDERWRITTEN 0x20000 #define OBJ_OVERCOPIED 0x40000 #define OBJ_BUGGY 0x80000 class osd_t; struct osd_pg_peering_state_t { osd_t* self; // FIXME: add types for pg_num and osd_num? uint64_t pg_num; std::unordered_map list_ops; int list_done = 0; }; struct osd_pg_t { int state; uint64_t pg_size = 3, pg_minsize = 2; uint64_t pg_num; // target_set = (role => osd_num). role starts from zero std::vector target_set; // moved object map. by default, each object is considered to reside on the target_set. // this map stores all objects that differ. // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario std::set state_dict; spp::sparse_hash_map obj_states; spp::sparse_hash_map ver_override; osd_pg_peering_state_t *peering_state = NULL; }; struct obj_ver_role { object_id oid; uint64_t version; uint64_t osd_num; bool is_stable; }; inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) { return a.oid < b.oid || // object versions go in descending order a.oid == b.oid && a.version > b.version || a.oid == b.oid && a.version == b.version || a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num; } // Max 64 replicas #define STRIPE_MASK 0x3F #define STRIPE_SHIFT 6 struct osd_obj_state_check_t { int start = 0; object_id oid = { 0 }; uint64_t max_ver = 0; uint64_t target_ver = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0; bool is_buggy = false; osd_obj_state_t state_obj; }; struct osd_peer_def_t { uint64_t osd_num = 0; std::string addr; int port = 0; time_t last_connect_attempt = 0; }; class osd_t { // config uint64_t osd_num = 1; // OSD numbers start with 1 bool run_primary = false; std::vector peers; blockstore_config_t config; std::string bind_address; int bind_port, listen_backlog; int client_queue_depth = 128; bool allow_test_ops = true; // peer OSDs std::map osd_peer_fds; std::vector pgs; int peering_state = 0; unsigned pg_count = 0; // client & peer I/O bool stopping = false; int inflight_ops = 0; blockstore_t *bs; ring_loop_t *ringloop; int wait_state = 0; int epoll_fd = 0; int listen_fd = 0; ring_consumer_t consumer; std::unordered_map clients; std::vector read_ready_clients; std::vector write_ready_clients; // methods // event loop, socket read/write void loop(); int handle_epoll_events(); void read_requests(); void handle_read(ring_data_t *data, int peer_fd); void handle_read_op(osd_client_t *cl); void handle_read_reply(osd_client_t *cl); void send_replies(); void make_reply(osd_op_t *op); void handle_send(ring_data_t *data, int peer_fd); void outbox_push(osd_client_t & cl, osd_op_t *op); // peer handling (primary OSD logic) void connect_peer(unsigned osd_num, const char *peer_host, int peer_port, std::function callback); void handle_connect_result(int peer_fd); void stop_client(int peer_fd); osd_peer_def_t parse_peer(std::string peer); void init_primary(); void handle_peers(); void start_pg_peering(int i); void calc_object_states(osd_pg_t &pg); void remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector &all, int end); // op execution void exec_op(osd_op_t *cur_op); void exec_sync_stab_all(osd_op_t *cur_op); void exec_show_config(osd_op_t *cur_op); void exec_secondary(osd_op_t *cur_op); void secondary_op_callback(osd_op_t *cur_op); public: osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop); ~osd_t(); bool shutdown(); };