
249 lines
6.0 KiB
Raw Normal View History

2019-12-15 01:11:51 +03:00
#pragma once
#include <sys/types.h>
2020-01-09 20:20:56 +03:00
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <malloc.h>
2019-12-15 01:11:51 +03:00
#include <arpa/inet.h>
#include <malloc.h>
2019-12-15 01:11:51 +03:00
#include <unordered_map>
#include <deque>
2019-12-15 01:11:51 +03:00
#include "blockstore.h"
2019-12-15 01:11:51 +03:00
#include "ringloop.h"
#include "osd_ops.h"
#include "sparsepp/sparsepp/spp.h"
// Split a packed "stripe" value into its two parts:
// the low 4 bits are the replica part, the remaining upper bits are the stripe number.
#define STRIPE_NUM(stripe) ((stripe) >> 4)
#define STRIPE_REPLICA(stripe) ((stripe) & 0xf)
// Operation direction tags.
// NOTE(review): presumably the values stored in osd_op_t::op_type - confirm against the users.
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
2019-12-28 01:25:55 +03:00
// Read-phase codes.
// NOTE(review): presumably stored in osd_client_t::read_state - confirm against the users.
#define CL_READ_OP 1
#define CL_READ_DATA 2
// Single bit 0x100 (long literal), above the small phase codes defined around it.
// NOTE(review): looks like a flag combined with a state value once a submission
// queue entry has been sent - confirm against the users.
#define SQE_SENT 0x100l
// Write-phase codes.
// NOTE(review): presumably stored in osd_client_t::write_state - confirm against the users.
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define CL_WRITE_DATA 3
// Build switch, disabled by default: uncomment to define OSD_STUB.
//#define OSD_STUB
2019-12-15 01:11:51 +03:00
// One OSD operation: request and reply packets, the blockstore operation,
// an optional data buffer and a completion callback.
// NOTE(review): as shown here the definition has no enclosing braces or terminating ';'
// and is interleaved with VCS timestamp lines, so it is not valid C++. The exact layout
// (for example whether op/op_buf and reply/reply_buf are meant to share storage) cannot
// be determined from this text - restore it from version control before editing.
struct osd_op_t
// NOTE(review): presumably OSD_OP_IN or OSD_OP_OUT - confirm.
int op_type;
2019-12-15 01:11:51 +03:00
// NOTE(review): presumably the socket descriptor of the peer this operation belongs to - confirm.
int peer_fd;
// Request: typed view and a zero-filled byte buffer of OSD_PACKET_SIZE bytes.
osd_any_op_t op;
uint8_t op_buf[OSD_PACKET_SIZE] = { 0 };
2019-12-15 01:11:51 +03:00
// Reply: typed view and a zero-filled byte buffer of OSD_PACKET_SIZE bytes.
osd_any_reply_t reply;
uint8_t reply_buf[OSD_PACKET_SIZE] = { 0 };
2019-12-15 01:11:51 +03:00
// Blockstore operation associated with this OSD operation.
blockstore_op_t bs_op;
2019-12-15 01:11:51 +03:00
// Data buffer, NULL by default. NOTE(review): ownership is not visible here - confirm who frees it.
void *buf = NULL;
// Callback that receives a pointer to an osd_op_t.
std::function<void(osd_op_t*)> callback;
2019-12-15 01:11:51 +03:00
2019-12-15 01:11:51 +03:00
2020-01-04 01:23:25 +03:00
2019-12-15 01:11:51 +03:00
// Only pointers to osd_op_t are stored below, so a declaration is sufficient here.
struct osd_op_t;

// State kept for one connected socket (a client or, as noted below, probably a peer OSD).
// Every member has a default so that a default-constructed object never exposes
// indeterminate values; the defaults equal what value-initialization produced before.
struct osd_client_t
{
    // Peer address, port and socket descriptor
    sockaddr_in peer_addr = {};
    int peer_port = 0;
    int peer_fd = 0;
    // NOTE(review): the meaning of peer_state values is not visible in this file - confirm.
    int peer_state = 0;
    // Callback taking an int. NOTE(review): presumably invoked with the connect result - confirm.
    std::function<void(int)> connect_callback;
    uint64_t osd_num = 0;
    //int in_flight_ops = 0;

    // Read state
    bool read_ready = false;
    bool reading = false;
    osd_op_t *read_op = nullptr;
    int read_reply_id = 0;
    iovec read_iov = {};
    msghdr read_msg = {};
    void *read_buf = nullptr;
    int read_remaining = 0;
    int read_state = 0;

    // Outbound operations sent to this client (which is probably an OSD peer)
    std::map<int, osd_op_t*> sent_ops;

    // Outbound messages (replies or requests)
    std::deque<osd_op_t*> outbox;

    // Write state
    osd_op_t *write_op = nullptr;
    iovec write_iov = {};
    msghdr write_msg = {};
    void *write_buf = nullptr;
    int write_remaining = 0;
    int write_state = 0;
};
// One entry of an acting set: an OSD number together with the role it plays.
struct osd_pg_role_t
{
    // role = (stripe role: 1, 2, 3, ...) | (stable ? 0 : 1<<63)
    uint64_t role;
    uint64_t osd_num;
};

// An acting set is an ordered list of (role, OSD) entries.
typedef std::vector<osd_pg_role_t> osd_acting_set_t;

namespace std
{
    // Hash for osd_acting_set_t so it can be used as an unordered container key.
    // Returns 0 for an empty set; otherwise folds role and osd_num of every entry,
    // in order, into the seed, so the result depends on the order of the entries.
    template<> struct hash<osd_acting_set_t>
    {
        inline size_t operator()(const osd_acting_set_t &s) const
        {
            size_t seed = 0;
            // Range-for avoids comparing a signed index against the unsigned size()
            for (const osd_pg_role_t &item : s)
            {
                // Copy-pasted from spp::hash_combine()
                seed ^= (item.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (item.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
            }
            return seed;
        }
    };
}
// Placement group states
2020-01-04 01:23:25 +03:00
// A placement group state is one base state (bits 0..3) OR-ed with
// any combination of the modifier flags (bits 4..6).
// NOTE(review): presumably the value kept in osd_pg_t::state - confirm against the users.
// Exactly one of these:
#define PG_OFFLINE (1<<0)
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
// Plus any of these:
#define PG_HAS_UNFOUND (1<<4)
#define PG_HAS_DEGRADED (1<<5)
#define PG_HAS_MISPLACED (1<<6)
// OSD object states
// NOTE(review): the values 0x01..0x04 are sequential rather than independent bits
// (OSD_DEGRADED is numerically OSD_CLEAN | OSD_MISPLACED), while 0x10000 and 0x20000
// are single high bits. Confirm that the low values are compared as an enumeration
// and only the high ones are tested as flags.
#define OSD_CLEAN 0x01
#define OSD_MISPLACED 0x02
#define OSD_DEGRADED 0x03
#define OSD_INCOMPLETE 0x04
#define OSD_HALF_STABLE 0x10000
#define OSD_NEEDS_ROLLBACK 0x20000
class osd_t;
// Only pointers to osd_op_t are stored below, so a declaration is sufficient here.
struct osd_op_t;

// Bookkeeping attached to a placement group (see osd_pg_t::peering_state).
// All members have defaults so a default-constructed object holds no indeterminate values.
struct osd_pg_peering_state_t
{
    // Back-pointer to an OSD object. NOTE(review): presumably the OSD that owns this state - confirm.
    osd_t* self = nullptr;
    // Number of the placement group this state refers to
    uint64_t pg_num = 0;
    // Operations indexed by a 64-bit key.
    // NOTE(review): presumably one listing operation per OSD number - confirm.
    std::unordered_map<uint64_t, osd_op_t*> list_ops;
    // NOTE(review): presumably the count of finished list operations - confirm.
    int list_done = 0;
};
// State of one placement group.
struct osd_pg_t
{
    // Combination of the PG_* values defined in this file.
    // No default is given: it is left to whoever creates the object, as before.
    int state;
    uint64_t pg_num;
    // Counters of objects in each non-clean category
    uint64_t n_unfound = 0;
    uint64_t n_degraded = 0;
    uint64_t n_misplaced = 0;
    std::vector<osd_pg_role_t> target_set;
    // moved object map. by default, each object is considered to reside on the target_set.
    // this map stores all objects that differ.
    // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::unordered_map<osd_acting_set_t, int> acting_set_ids;
    std::map<int, osd_acting_set_t> acting_sets;
    spp::sparse_hash_map<object_id, int> object_map;
    // Peering bookkeeping, null by default.
    // NOTE(review): ownership is not visible here - confirm who deletes it.
    osd_pg_peering_state_t *peering_state = nullptr;
};
// Definition of one peer OSD: its number, network endpoint and the time
// of the last connection attempt. All fields default to zero / empty.
struct osd_peer_def_t
{
    uint64_t osd_num = 0;
    std::string addr;
    int port = 0;
    time_t last_connect_attempt = 0;
};
2019-12-15 01:11:51 +03:00
// The OSD object: configuration, peer and placement group state, client I/O state,
// and the methods that drive them.
// NOTE(review): as shown here the declaration has no braces or access specifiers, is
// interleaved with VCS timestamp lines, and its end is not visible, so it is not valid
// C++ and the public/private split cannot be determined - restore it from version control
// rather than reconstructing it by hand.
class osd_t
// config
2020-01-04 01:23:25 +03:00
uint64_t osd_num = 1; // OSD numbers start with 1
bool run_primary = false;
std::vector<osd_peer_def_t> peers;
blockstore_config_t config;
2019-12-15 01:52:08 +03:00
std::string bind_address;
// Note: neither of these two has a default value here.
int bind_port, listen_backlog;
2019-12-15 01:11:51 +03:00
int client_queue_depth = 128;
2019-12-15 15:30:51 +03:00
bool allow_test_ops = true;
2019-12-15 01:11:51 +03:00
// peer OSDs
// NOTE(review): presumably maps OSD number -> socket descriptor - confirm.
std::map<uint64_t, int> osd_peer_fds;
std::vector<osd_pg_t> pgs;
int peering_state = 0;
unsigned pg_count = 0;
// client & peer I/O
2019-12-15 01:11:51 +03:00
2019-12-19 22:16:04 +03:00
bool stopping = false;
int inflight_ops = 0;
// Note: bs and ringloop have no default value here; the constructor declared below takes both.
blockstore_t *bs;
2019-12-15 01:11:51 +03:00
ring_loop_t *ringloop;
int wait_state = 0;
int epoll_fd = 0;
int listen_fd = 0;
ring_consumer_t consumer;
// NOTE(review): presumably keyed by the client's socket descriptor - confirm.
std::unordered_map<int,osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// methods
2020-01-04 01:23:25 +03:00
// event loop, socket read/write
2019-12-15 01:11:51 +03:00
void loop();
int handle_epoll_events();
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_read_op(osd_client_t *cl);
void handle_read_reply(osd_client_t *cl);
2019-12-15 01:11:51 +03:00
void send_replies();
void make_reply(osd_op_t *op);
void handle_send(ring_data_t *data, int peer_fd);
void outbox_push(osd_client_t & cl, osd_op_t *op);
// peer handling (primary OSD logic)
// NOTE(review): osd_num is `unsigned` here but uint64_t in the osd_num member and in
// osd_peer_fds - confirm that the narrower type is intended.
void connect_peer(unsigned osd_num, const char *peer_host, int peer_port, std::function<void(int)> callback);
2020-01-04 01:23:25 +03:00
void handle_connect_result(int peer_fd);
void stop_client(int peer_fd);
osd_peer_def_t parse_peer(std::string peer);
void init_primary();
void handle_peers();
// NOTE(review): i is presumably an index into pgs - confirm.
void start_pg_peering(int i);
2020-01-04 01:23:25 +03:00
// op execution
2019-12-28 01:25:55 +03:00
void exec_op(osd_op_t *cur_op);
void exec_sync_stab_all(osd_op_t *cur_op);
void exec_show_config(osd_op_t *cur_op);
void exec_secondary(osd_op_t *cur_op);
2019-12-19 13:56:26 +03:00
void secondary_op_callback(osd_op_t *cur_op);
2019-12-15 01:11:51 +03:00
// Constructor: takes the blockstore configuration, a blockstore and a ring loop.
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
2019-12-15 01:11:51 +03:00
2019-12-15 01:52:08 +03:00
// NOTE(review): the meaning of the bool result is not visible in this file - confirm.
bool shutdown();
2019-12-15 01:11:51 +03:00