#pragma once

// Declarations for the OSD daemon: the in-memory form of a network operation
// (osd_op_t), per-connection socket state (osd_client_t), placement-group
// bookkeeping (osd_pg_*), and the osd_t class that ties them together.
//
// NOTE(review): in this copy of the file the targets of the first eleven
// #include directives and every template argument list (std::function,
// std::map, std::deque, std::vector, std::unordered_map,
// spp::sparse_hash_map, and the std::hash specialization) appear to have been
// stripped, presumably by a tool that dropped everything between '<' and '>'.
// Restore them from version control before building; they are deliberately
// NOT guessed here.

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "blockstore.h"
#include "ringloop.h"
#include "osd_ops.h"
#include "sparsepp/sparsepp/spp.h"

// A "stripe" value packs two fields: the low 4 bits are the replica index and
// the remaining high bits are the stripe number.
#define STRIPE_NUM(stripe) ((stripe) >> 4)
#define STRIPE_REPLICA(stripe) ((stripe) & 0xf)

// Values for osd_op_t::op_type (presumably inbound vs. outbound -- confirm against the .cpp files)
#define OSD_OP_IN 0
#define OSD_OP_OUT 1

// Values for osd_client_t::read_state (presumably -- confirm against the read path)
#define CL_READ_OP 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3

// Flag bit 0x100, written as a long literal; its users are not visible in this header
#define SQE_SENT 0x100l

// Values for osd_client_t::write_state (presumably -- confirm against the write path)
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define CL_WRITE_DATA 3

// Presumably the maximum number of events fetched per epoll_wait() call -- confirm at the call site
#define MAX_EPOLL_EVENTS 16

//#define OSD_STUB

// One OSD network operation together with its reply and the blockstore
// operation used to execute it.
struct osd_op_t
{
    // presumably OSD_OP_IN or OSD_OP_OUT -- confirm
    int op_type;
    // Socket descriptor of the peer this operation belongs to
    int peer_fd;
    // Request packet: typed view and raw byte view share the same storage.
    // The byte view is zero-initialized and is OSD_PACKET_SIZE bytes long.
    union
    {
        osd_any_op_t op;
        uint8_t op_buf[OSD_PACKET_SIZE] = { 0 };
    };
    // Reply packet, laid out the same way as the request above
    union
    {
        osd_any_reply_t reply;
        uint8_t reply_buf[OSD_PACKET_SIZE] = { 0 };
    };
    blockstore_op_t bs_op;
    // Data buffer attached to the operation (NULL when there is none).
    // NOTE(review): the destructor is user-declared, so it presumably releases
    // this buffer; copy operations are not deleted, so copying an osd_op_t
    // could then free it twice -- confirm against the destructor definition.
    void *buf = NULL;
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::function callback;

    ~osd_op_t();
};

// Values for osd_client_t::peer_state
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2

// Per-connection state for one socket (a client or a peer OSD).
struct osd_client_t
{
    sockaddr_in peer_addr;
    int peer_port;
    int peer_fd;
    // PEER_CONNECTING or PEER_CONNECTED
    int peer_state;
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::function connect_callback;
    // Defaults to 0; OSD numbers start with 1 (see osd_t::osd_num), so 0
    // presumably means "not a known OSD peer" -- confirm
    uint64_t osd_num = 0;
    //int in_flight_ops = 0;

    // Read state
    bool read_ready = false;
    bool reading = false;
    osd_op_t *read_op = NULL;
    int read_reply_id = 0;
    iovec read_iov;
    msghdr read_msg;
    void *read_buf = NULL;
    int read_remaining = 0;
    // presumably one of the CL_READ_* values, 0 when idle -- confirm
    int read_state = 0;

    // Outbound operations sent to this client (which is probably an OSD peer)
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::map sent_ops;

    // Outbound messages (replies or requests)
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::deque outbox;

    // Write state
    osd_op_t *write_op = NULL;
    iovec write_iov;
    msghdr write_msg;
    void *write_buf = NULL;
    int write_remaining = 0;
    // presumably one of the CL_WRITE_* values, 0 when idle -- confirm
    int write_state = 0;
};

// One member of an acting set: which OSD holds which role.
struct osd_pg_role_t
{
    // role = (stripe role: 1, 2, 3, ...) | (stable ? 0 : 1<<63)
    uint64_t role;
    uint64_t osd_num;
};

// NOTE(review): template arguments missing. The hash below reads
// s[i].role and s[i].osd_num, so the element type has those two fields.
typedef std::vector osd_acting_set_t;

namespace std
{
    // Hash for osd_acting_set_t: folds the role and the OSD number of every
    // element, in order, into a single seed.
    // NOTE(review): the specialization's template argument is missing; the
    // call operator takes a const osd_acting_set_t &.
    template<> struct hash
    {
        inline size_t operator()(const osd_acting_set_t &s) const
        {
            size_t seed = 0;
            // NOTE(review): `i` is a signed int compared against the unsigned
            // s.size(); a size_t index or a range-for would avoid the
            // signed/unsigned comparison without changing the result.
            for (int i = 0; i < s.size(); i++)
            {
                // Copy-pasted from spp::hash_combine()
                seed ^= (s[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (s[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
            }
            return seed;
        }
    };
}

// Placement group states (bit flags, stored in osd_pg_t::state)

// Exactly one of these:
#define PG_OFFLINE (1<<0)
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)

// Plus any of these:
#define PG_HAS_UNFOUND (1<<4)
#define PG_HAS_DEGRADED (1<<5)
#define PG_HAS_MISPLACED (1<<6)

// OSD object states
// The first four are small consecutive values; the last two are separate
// high bits, so presumably they can be OR-ed onto one of the first four -- confirm.
#define OSD_CLEAN 0x01
#define OSD_MISPLACED 0x02
#define OSD_DEGRADED 0x03
#define OSD_INCOMPLETE 0x04
#define OSD_HALF_STABLE 0x10000
#define OSD_NEEDS_ROLLBACK 0x20000

class osd_t;

// Bookkeeping for one placement group while it is being peered.
struct osd_pg_peering_state_t
{
    // The OSD that is running this peering
    osd_t* self;
    uint64_t pg_num;
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::unordered_map list_ops;
    int list_done = 0;
};

// State of one placement group.
struct osd_pg_t
{
    // Combination of PG_* flags
    int state;
    uint64_t pg_num;
    // Object counters, presumably matching the PG_HAS_UNFOUND / PG_HAS_DEGRADED /
    // PG_HAS_MISPLACED flags -- confirm
    uint64_t n_unfound = 0, n_degraded = 0, n_misplaced = 0;
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::vector target_set;
    // moved object map. by default, each object is considered to reside on the target_set.
    // this map stores all objects that differ.
    // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    // NOTE(review): template arguments missing on the three containers below
    std::unordered_map acting_set_ids;
    std::map acting_sets;
    spp::sparse_hash_map object_map;
    // NULL by default.
    // NOTE(review): raw pointer; who allocates and frees it is not visible in this header.
    osd_pg_peering_state_t *peering_state = NULL;
};

// Definition of a peer OSD, as returned by osd_t::parse_peer().
struct osd_peer_def_t
{
    uint64_t osd_num = 0;
    std::string addr;
    int port = 0;
    time_t last_connect_attempt = 0;
};

// The OSD itself: configuration, peer and placement-group state, and the
// socket event loop built on top of a blockstore and a ring loop.
class osd_t
{
    // config

    uint64_t osd_num = 1; // OSD numbers start with 1
    bool run_primary = false;
    // NOTE(review): template arguments missing (see the note at the top of the file)
    std::vector peers;
    blockstore_config_t config;
    std::string bind_address;
    // NOTE(review): no default initializers here; presumably assigned by the constructor -- confirm
    int bind_port, listen_backlog;
    int client_queue_depth = 128;
    bool allow_test_ops = true;

    // peer OSDs

    // NOTE(review): template arguments missing on the two containers below
    std::map osd_peer_fds;
    std::vector pgs;
    int peering_state = 0;
    unsigned pg_count = 0;

    // client & peer I/O

    bool stopping = false;
    int inflight_ops = 0;
    // NOTE(review): bs and ringloop have no default initializers; presumably set
    // from the constructor arguments of the same names -- confirm
    blockstore_t *bs;
    ring_loop_t *ringloop;

    int wait_state = 0;
    // NOTE(review): both descriptors default to 0, which is itself a valid file
    // descriptor (stdin); check how "not opened yet" is detected before relying on it.
    int epoll_fd = 0;
    int listen_fd = 0;
    ring_consumer_t consumer;

    // NOTE(review): template arguments missing on the three containers below
    std::unordered_map clients;
    std::vector read_ready_clients;
    std::vector write_ready_clients;

    // methods

    // event loop, socket read/write
    void loop();
    int handle_epoll_events();
    void read_requests();
    void handle_read(ring_data_t *data, int peer_fd);
    void handle_read_op(osd_client_t *cl);
    void handle_read_reply(osd_client_t *cl);
    void send_replies();
    void make_reply(osd_op_t *op);
    void handle_send(ring_data_t *data, int peer_fd);
    void outbox_push(osd_client_t & cl, osd_op_t *op);

    // peer handling (primary OSD logic)
    // NOTE(review): template arguments of the std::function parameter are missing
    void connect_peer(unsigned osd_num, const char *peer_host, int peer_port, std::function callback);
    void handle_connect_result(int peer_fd);
    void stop_client(int peer_fd);
    osd_peer_def_t parse_peer(std::string peer);
    void init_primary();
    void handle_peers();
    void start_pg_peering(int i);

    // op execution
    void exec_op(osd_op_t *cur_op);
    void exec_sync_stab_all(osd_op_t *cur_op);
    void exec_show_config(osd_op_t *cur_op);
    void exec_secondary(osd_op_t *cur_op);
    void secondary_op_callback(osd_op_t *cur_op);

public:
    // Takes the blockstore configuration plus the blockstore and ring loop to
    // work with; both are passed as raw pointers.
    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
    ~osd_t();
    // NOTE(review): the meaning of the bool result is not visible in this header
    bool shutdown();
};