forked from vitalif/vitastor
Implement PG state locking and PG moving in response to etcd events
parent
ec4a52af48
commit
7b57eeeeb3
2
Makefile
2
Makefile
|
@ -67,6 +67,8 @@ rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
|||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
|
||||
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
|
||||
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
|
||||
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
|
||||
|
||||
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
|
||||
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
|
||||
|
|
18
osd.cpp
18
osd.cpp
|
@ -106,10 +106,8 @@ void osd_t::parse_config(blockstore_config_t & config)
|
|||
int major, minor;
|
||||
if (sscanf(config["etcd_version"].c_str(), "%d.%d", &major, &minor) < 2)
|
||||
throw std::runtime_error("etcd_version should be in the form MAJOR.MINOR (for example, 3.2)");
|
||||
if (major < 3 || major == 3 && minor < 2)
|
||||
throw std::runtime_error("Your etcd is too old, minimum required version is 3.2");
|
||||
else if (major == 3 && minor == 2)
|
||||
etcd_api_path = "/v3alpha";
|
||||
if (major < 3 || major == 3 && minor < 3)
|
||||
throw std::runtime_error("Your etcd is too old, minimum required version is 3.3");
|
||||
else if (major == 3 && minor == 3)
|
||||
etcd_api_path = "/v3beta";
|
||||
else
|
||||
|
@ -117,10 +115,6 @@ void osd_t::parse_config(blockstore_config_t & config)
|
|||
}
|
||||
else
|
||||
etcd_api_path = "/v3";
|
||||
if ((pos = etcd_address.find(':')) >= 0)
|
||||
etcd_host = etcd_address.substr(0, pos);
|
||||
else
|
||||
etcd_host = etcd_address;
|
||||
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
|
||||
if (etcd_report_interval <= 0)
|
||||
etcd_report_interval = 30;
|
||||
|
@ -153,12 +147,10 @@ void osd_t::parse_config(blockstore_config_t & config)
|
|||
peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
|
||||
if (!peer_connect_interval)
|
||||
peer_connect_interval = 5;
|
||||
http_request_timeout = strtoull(config["http_request_timeout"].c_str(), NULL, 10);
|
||||
if (!http_request_timeout)
|
||||
http_request_timeout = 5;
|
||||
peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
|
||||
if (!peer_connect_timeout)
|
||||
peer_connect_timeout = 5;
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
}
|
||||
|
||||
void osd_t::bind_socket()
|
||||
|
@ -394,13 +386,13 @@ void osd_t::stop_client(int peer_fd)
|
|||
if (cl.osd_num)
|
||||
{
|
||||
// Reload configuration from etcd when the connection is dropped
|
||||
printf("[%lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
|
||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
|
||||
peer_states.erase(cl.osd_num);
|
||||
repeer_pgs(cl.osd_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("[%lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
}
|
||||
}
|
||||
clients.erase(it);
|
||||
|
|
70
osd.h
70
osd.h
|
@ -18,6 +18,7 @@
|
|||
#include "timerfd_manager.h"
|
||||
#include "osd_ops.h"
|
||||
#include "osd_peering_pg.h"
|
||||
#include "osd_http.h"
|
||||
#include "json11/json11.hpp"
|
||||
|
||||
#define OSD_OP_IN 0
|
||||
|
@ -49,10 +50,6 @@
|
|||
#define MAX_RECOVERY_QUEUE 2048
|
||||
#define DEFAULT_RECOVERY_QUEUE 4
|
||||
|
||||
#define MAX_ETCD_ATTEMPTS 5
|
||||
#define ETCD_START_INTERVAL 5000
|
||||
#define ETCD_RETRY_INTERVAL 1000
|
||||
|
||||
//#define OSD_STUB
|
||||
|
||||
extern const char* osd_op_names[];
|
||||
|
@ -189,16 +186,30 @@ struct osd_wanted_peer_t
|
|||
int address_index;
|
||||
};
|
||||
|
||||
struct http_response_t;
|
||||
struct pg_config_t
|
||||
{
|
||||
bool exists;
|
||||
osd_num_t primary;
|
||||
std::vector<osd_num_t> target_set;
|
||||
std::vector<std::vector<osd_num_t>> target_history;
|
||||
bool pause;
|
||||
osd_num_t cur_primary;
|
||||
int cur_state;
|
||||
};
|
||||
|
||||
struct websocket_t;
|
||||
struct json_kv_t
|
||||
{
|
||||
std::string key;
|
||||
json11::Json value;
|
||||
};
|
||||
|
||||
class osd_t
|
||||
{
|
||||
// config
|
||||
|
||||
blockstore_config_t config;
|
||||
std::string etcd_address, etcd_host, etcd_prefix, etcd_api_path;
|
||||
// FIXME Allow multiple etcd addresses and select random address
|
||||
std::string etcd_address, etcd_prefix, etcd_api_path;
|
||||
int etcd_report_interval = 30;
|
||||
|
||||
bool readonly = false;
|
||||
|
@ -214,16 +225,26 @@ class osd_t
|
|||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
|
||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
int peer_connect_interval = 5;
|
||||
int http_request_timeout = 5;
|
||||
int peer_connect_timeout = 5;
|
||||
int log_level = 0;
|
||||
|
||||
// peer OSDs
|
||||
// cluster state
|
||||
|
||||
std::string etcd_lease_id, etcd_watch_revision;
|
||||
std::string etcd_lease_id;
|
||||
int etcd_watches_initialised = 0;
|
||||
uint64_t etcd_watch_revision = 0;
|
||||
websocket_t *etcd_watch_ws = NULL;
|
||||
std::map<osd_num_t, json11::Json> peer_states;
|
||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||
bool loading_peer_config = false;
|
||||
int etcd_failed_attempts = 0;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
std::set<pg_num_t> pg_state_dirty;
|
||||
bool pg_config_applied = false;
|
||||
bool etcd_reporting_pg_state = false;
|
||||
bool etcd_reporting_stats = false;
|
||||
|
||||
// peers and PGs
|
||||
|
||||
std::map<uint64_t, int> osd_peer_fds;
|
||||
std::map<pg_num_t, pg_t> pgs;
|
||||
|
@ -267,26 +288,36 @@ class osd_t
|
|||
uint64_t subop_stat_count[2][OSD_OP_MAX+1] = { 0 };
|
||||
|
||||
// cluster connection
|
||||
void http_request(std::string host, std::string request, bool streaming, std::function<void(const http_response_t *response)> callback);
|
||||
void http_request_json(std::string host, std::string request, std::function<void(std::string, json11::Json data)> callback);
|
||||
websocket_t* open_websocket(std::string host, std::string path, std::function<void(const http_response_t *msg)> callback);
|
||||
void etcd_call(std::string api, json11::Json payload, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
||||
void http_request(const std::string & host, const std::string & request,
|
||||
const http_options_t & options, std::function<void(const http_response_t *response)> callback);
|
||||
void http_request_json(const std::string & host, const std::string & request, int timeout,
|
||||
std::function<void(std::string, json11::Json data)> callback);
|
||||
websocket_t* open_websocket(const std::string & host, const std::string & path, int timeout,
|
||||
std::function<void(const http_response_t *msg)> callback);
|
||||
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
|
||||
json_kv_t parse_etcd_kv(const json11::Json & kv_json);
|
||||
void parse_config(blockstore_config_t & config);
|
||||
void init_cluster();
|
||||
void start_etcd_watcher();
|
||||
void load_global_config();
|
||||
void bind_socket();
|
||||
void acquire_lease();
|
||||
void create_state();
|
||||
json11::Json get_osd_state();
|
||||
void create_osd_state();
|
||||
void renew_lease();
|
||||
void print_stats();
|
||||
void reset_stats();
|
||||
json11::Json get_status();
|
||||
json11::Json get_statistics();
|
||||
void report_statistics();
|
||||
void report_pg_state(pg_t & pg);
|
||||
void report_pg_states();
|
||||
void load_pgs();
|
||||
void parse_pgs(const json11::Json & pg_config, const std::map<pg_num_t, json11::Json> & pg_history);
|
||||
void parse_pg_state(const std::string & key, const json11::Json & value);
|
||||
void apply_pg_count();
|
||||
void apply_pg_config();
|
||||
void load_and_connect_peers();
|
||||
void parse_etcd_osd_state(const std::string & key, const json11::Json & value);
|
||||
|
||||
// event loop, socket read/write
|
||||
void loop();
|
||||
|
@ -313,6 +344,7 @@ class osd_t
|
|||
void repeer_pgs(osd_num_t osd_num);
|
||||
void start_pg_peering(pg_num_t pg_num);
|
||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||
void discard_list_subop(osd_op_t *list_op);
|
||||
bool stop_pg(pg_num_t pg_num);
|
||||
void finish_stop_pg(pg_t & pg);
|
||||
|
||||
|
@ -356,7 +388,7 @@ class osd_t
|
|||
public:
|
||||
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
|
||||
~osd_t();
|
||||
void force_stop();
|
||||
void force_stop(int exitcode);
|
||||
bool shutdown();
|
||||
};
|
||||
|
||||
|
|
764
osd_cluster.cpp
764
osd_cluster.cpp
File diff suppressed because it is too large
Load Diff
|
@ -129,12 +129,16 @@ void osd_t::handle_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t osd
|
|||
if (!pg.flush_actions.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_UNCLEAN;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
for (osd_op_t *op: continue_ops)
|
||||
{
|
||||
continue_primary_write(op);
|
||||
}
|
||||
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
37
osd_http.cpp
37
osd_http.cpp
|
@ -18,6 +18,7 @@ static std::string trim(const std::string & in);
|
|||
static std::string ws_format_frame(int type, uint64_t size);
|
||||
static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
|
||||
|
||||
// FIXME: Use keepalive
|
||||
struct http_co_t
|
||||
{
|
||||
ring_loop_t *ringloop;
|
||||
|
@ -66,15 +67,19 @@ struct http_co_t
|
|||
#define HTTP_CO_WEBSOCKET 5
|
||||
#define HTTP_CO_CHUNKED 6
|
||||
|
||||
void osd_t::http_request(std::string host, std::string request, bool streaming, std::function<void(const http_response_t *response)> callback)
|
||||
#define DEFAULT_TIMEOUT 5000
|
||||
|
||||
// FIXME: Remove osd_t dependency from here
|
||||
void osd_t::http_request(const std::string & host, const std::string & request,
|
||||
const http_options_t & options, std::function<void(const http_response_t *response)> callback)
|
||||
{
|
||||
http_co_t *handler = new http_co_t();
|
||||
handler->ringloop = ringloop;
|
||||
handler->epoll_fd = epoll_fd;
|
||||
handler->epoll_handlers = &epoll_handlers;
|
||||
handler->request_timeout = http_request_timeout;
|
||||
handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
|
||||
handler->want_streaming = options.want_streaming;
|
||||
handler->tfd = tfd;
|
||||
handler->want_streaming = streaming;
|
||||
handler->host = host;
|
||||
handler->request = request;
|
||||
handler->callback = callback;
|
||||
|
@ -82,10 +87,10 @@ void osd_t::http_request(std::string host, std::string request, bool streaming,
|
|||
handler->start_connection();
|
||||
}
|
||||
|
||||
void osd_t::http_request_json(std::string host, std::string request,
|
||||
std::function<void(std::string, json11::Json r)> callback)
|
||||
void osd_t::http_request_json(const std::string & host, const std::string & request,
|
||||
int timeout, std::function<void(std::string, json11::Json r)> callback)
|
||||
{
|
||||
http_request(host, request, false, [this, callback](const http_response_t* res)
|
||||
http_request(host, request, { .timeout = timeout }, [this, callback](const http_response_t* res)
|
||||
{
|
||||
if (res->error_code != 0)
|
||||
{
|
||||
|
@ -108,7 +113,8 @@ void osd_t::http_request_json(std::string host, std::string request,
|
|||
});
|
||||
}
|
||||
|
||||
websocket_t* osd_t::open_websocket(std::string host, std::string path, std::function<void(const http_response_t *msg)> callback)
|
||||
websocket_t* osd_t::open_websocket(const std::string & host, const std::string & path,
|
||||
int timeout, std::function<void(const http_response_t *msg)> callback)
|
||||
{
|
||||
std::string request = "GET "+path+" HTTP/1.1\r\n"
|
||||
"Host: "+host+"\r\n"
|
||||
|
@ -121,9 +127,9 @@ websocket_t* osd_t::open_websocket(std::string host, std::string path, std::func
|
|||
handler->ringloop = ringloop;
|
||||
handler->epoll_fd = epoll_fd;
|
||||
handler->epoll_handlers = &epoll_handlers;
|
||||
handler->request_timeout = http_request_timeout;
|
||||
handler->tfd = tfd;
|
||||
handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
|
||||
handler->want_streaming = false;
|
||||
handler->tfd = tfd;
|
||||
handler->host = host;
|
||||
handler->request = request;
|
||||
handler->callback = callback;
|
||||
|
@ -215,11 +221,11 @@ void http_co_t::start_connection()
|
|||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
if (request_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(1000*request_timeout, false, [this](int timer_id)
|
||||
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
|
||||
{
|
||||
if (response.length() == 0)
|
||||
{
|
||||
parsed.error_code = EIO;
|
||||
parsed.error_code = ETIME;
|
||||
}
|
||||
delete this;
|
||||
});
|
||||
|
@ -440,8 +446,8 @@ void http_co_t::handle_read()
|
|||
if (!len)
|
||||
{
|
||||
// Zero length chunk indicates EOF
|
||||
delete this;
|
||||
return;
|
||||
parsed.eof = true;
|
||||
break;
|
||||
}
|
||||
if (response.size() < pos+2+len+2)
|
||||
{
|
||||
|
@ -454,6 +460,11 @@ void http_co_t::handle_read()
|
|||
{
|
||||
response = response.substr(prev);
|
||||
}
|
||||
if (parsed.eof)
|
||||
{
|
||||
delete this;
|
||||
return;
|
||||
}
|
||||
if (want_streaming && parsed.body.size() > 0)
|
||||
{
|
||||
callback(&parsed);
|
||||
|
|
|
@ -10,6 +10,12 @@
|
|||
#define WS_PING 9
|
||||
#define WS_PONG 10
|
||||
|
||||
struct http_options_t
|
||||
{
|
||||
int timeout;
|
||||
bool want_streaming;
|
||||
};
|
||||
|
||||
struct http_response_t
|
||||
{
|
||||
bool eof = false;
|
||||
|
|
|
@ -10,7 +10,7 @@ static void handle_sigint(int sig)
|
|||
if (osd && !force_stopping)
|
||||
{
|
||||
force_stopping = true;
|
||||
osd->force_stop();
|
||||
osd->force_stop(0);
|
||||
return;
|
||||
}
|
||||
exit(0);
|
||||
|
|
|
@ -168,12 +168,13 @@ void osd_t::handle_peers()
|
|||
{
|
||||
if (!p.second.peering_state->list_ops.size())
|
||||
{
|
||||
p.second.calc_object_states();
|
||||
p.second.calc_object_states(log_level);
|
||||
report_pg_state(p.second);
|
||||
incomplete_objects += p.second.incomplete_objects.size();
|
||||
misplaced_objects += p.second.misplaced_objects.size();
|
||||
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
|
||||
degraded_objects += p.second.degraded_objects.size();
|
||||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
if (p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN) == (PG_ACTIVE | PG_HAS_UNCLEAN))
|
||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||
else
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
|
@ -195,7 +196,7 @@ void osd_t::handle_peers()
|
|||
bool still = false;
|
||||
for (auto & p: pgs)
|
||||
{
|
||||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
if (p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN) == (PG_ACTIVE | PG_HAS_UNCLEAN))
|
||||
{
|
||||
if (!p.second.flush_batch)
|
||||
{
|
||||
|
@ -224,7 +225,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||
for (auto & p: pgs)
|
||||
{
|
||||
bool repeer = false;
|
||||
if (p.second.state != PG_OFFLINE)
|
||||
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
{
|
||||
for (osd_num_t pg_osd: p.second.all_peers)
|
||||
{
|
||||
|
@ -239,7 +240,6 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||
// Repeer this pg
|
||||
printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
|
||||
start_pg_peering(p.second.pg_num);
|
||||
peering_state |= OSD_PEERING_PGS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -250,7 +250,8 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
|
|||
{
|
||||
auto & pg = pgs[pg_num];
|
||||
pg.state = PG_PEERING;
|
||||
pg.print_state();
|
||||
this->peering_state |= OSD_PEERING_PGS;
|
||||
report_pg_state(pg);
|
||||
// Reset PG state
|
||||
pg.state_dict.clear();
|
||||
incomplete_objects -= pg.incomplete_objects.size();
|
||||
|
@ -312,14 +313,14 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
|
|||
if (!found)
|
||||
{
|
||||
pg.state = PG_INCOMPLETE;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (pg.pg_cursize < pg.pg_minsize)
|
||||
{
|
||||
pg.state = PG_INCOMPLETE;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
std::set<osd_num_t> cur_peers;
|
||||
for (auto peer_osd: pg.all_peers)
|
||||
|
@ -342,25 +343,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
|
|||
if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
|
||||
{
|
||||
// Discard the result after completion, which, chances are, will be unsuccessful
|
||||
auto list_op = it->second;
|
||||
if (list_op->peer_fd == 0)
|
||||
{
|
||||
// Self
|
||||
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
|
||||
{
|
||||
if (list_op->bs_op->buf)
|
||||
free(list_op->bs_op->buf);
|
||||
delete list_op;
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
list_op->callback = [](osd_op_t *list_op)
|
||||
{
|
||||
delete list_op;
|
||||
};
|
||||
}
|
||||
discard_list_subop(it->second);
|
||||
pg.peering_state->list_ops.erase(it);
|
||||
it = pg.peering_state->list_ops.begin();
|
||||
}
|
||||
|
@ -491,6 +474,28 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||
}
|
||||
}
|
||||
|
||||
void osd_t::discard_list_subop(osd_op_t *list_op)
|
||||
{
|
||||
if (list_op->peer_fd == 0)
|
||||
{
|
||||
// Self
|
||||
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
|
||||
{
|
||||
if (list_op->bs_op->buf)
|
||||
free(list_op->bs_op->buf);
|
||||
delete list_op;
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
list_op->callback = [](osd_op_t *list_op)
|
||||
{
|
||||
delete list_op;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_t::stop_pg(pg_num_t pg_num)
|
||||
{
|
||||
auto pg_it = pgs.find(pg_num);
|
||||
|
@ -499,19 +504,52 @@ bool osd_t::stop_pg(pg_num_t pg_num)
|
|||
return false;
|
||||
}
|
||||
auto & pg = pg_it->second;
|
||||
if (pg.peering_state)
|
||||
{
|
||||
// Stop peering
|
||||
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
|
||||
{
|
||||
discard_list_subop(it->second);
|
||||
}
|
||||
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
|
||||
{
|
||||
if (it->second.buf)
|
||||
{
|
||||
free(it->second.buf);
|
||||
}
|
||||
}
|
||||
delete pg.peering_state;
|
||||
pg.peering_state = NULL;
|
||||
}
|
||||
if (!(pg.state & PG_ACTIVE))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
|
||||
if (pg.inflight == 0)
|
||||
if (pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else
|
||||
{
|
||||
report_pg_state(pg);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_t::finish_stop_pg(pg_t & pg)
|
||||
{
|
||||
pg.state = PG_OFFLINE;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
|
||||
void osd_t::report_pg_state(pg_t & pg)
|
||||
{
|
||||
pg.print_state();
|
||||
this->pg_state_dirty.insert(pg.pg_num);
|
||||
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
||||
{
|
||||
apply_pg_config();
|
||||
}
|
||||
report_pg_states();
|
||||
}
|
||||
|
|
|
@ -43,6 +43,7 @@ struct pg_obj_state_check_t
|
|||
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
|
||||
uint64_t n_unstable = 0, n_buggy = 0;
|
||||
pg_osd_set_t osd_set;
|
||||
int log_level;
|
||||
|
||||
void walk();
|
||||
void start_object();
|
||||
|
@ -198,14 +199,16 @@ void pg_obj_state_check_t::finish_object()
|
|||
}
|
||||
if (n_roles < pg->pg_minsize)
|
||||
{
|
||||
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||
for (int i = ver_start; i < ver_end; i++)
|
||||
if (log_level > 1)
|
||||
{
|
||||
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||
for (int i = ver_start; i < ver_end; i++)
|
||||
{
|
||||
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||
}
|
||||
}
|
||||
if (0)
|
||||
if (log_level > 2)
|
||||
{
|
||||
// For future debug level
|
||||
for (int i = obj_start; i < obj_end; i++)
|
||||
{
|
||||
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||
|
@ -216,10 +219,13 @@ void pg_obj_state_check_t::finish_object()
|
|||
}
|
||||
else if (n_roles < pg->pg_cursize)
|
||||
{
|
||||
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||
for (int i = ver_start; i < ver_end; i++)
|
||||
if (log_level > 1)
|
||||
{
|
||||
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
||||
for (int i = ver_start; i < ver_end; i++)
|
||||
{
|
||||
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
||||
}
|
||||
}
|
||||
state = OBJ_DEGRADED;
|
||||
pg->state = pg->state | PG_HAS_DEGRADED;
|
||||
|
@ -321,10 +327,11 @@ void pg_obj_state_check_t::finish_object()
|
|||
}
|
||||
|
||||
// FIXME: Write at least some tests for this function
|
||||
void pg_t::calc_object_states()
|
||||
void pg_t::calc_object_states(int log_level)
|
||||
{
|
||||
// Copy all object lists into one array
|
||||
pg_obj_state_check_t st;
|
||||
st.log_level = log_level;
|
||||
st.pg = this;
|
||||
auto ps = peering_state;
|
||||
for (auto it: ps->list_results)
|
||||
|
@ -352,12 +359,10 @@ void pg_t::calc_object_states()
|
|||
std::sort(st.list.begin(), st.list.end());
|
||||
// Walk over it and check object states
|
||||
st.walk();
|
||||
print_state();
|
||||
}
|
||||
|
||||
void pg_t::print_state()
|
||||
{
|
||||
// FIXME Immediately report state on each change
|
||||
printf(
|
||||
"[PG %u] is %s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
|
@ -373,13 +378,15 @@ void pg_t::print_state()
|
|||
);
|
||||
}
|
||||
|
||||
const int pg_state_bit_count = 10;
|
||||
const int pg_state_bit_count = 12;
|
||||
|
||||
const int pg_state_bits[10] = {
|
||||
PG_OFFLINE,
|
||||
const int pg_state_bits[12] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
PG_ACTIVE,
|
||||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
PG_HAS_INCOMPLETE,
|
||||
PG_HAS_DEGRADED,
|
||||
|
@ -387,11 +394,13 @@ const int pg_state_bits[10] = {
|
|||
PG_HAS_UNCLEAN,
|
||||
};
|
||||
|
||||
const char *pg_state_names[10] = {
|
||||
"offline",
|
||||
const char *pg_state_names[12] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
"active",
|
||||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
"has_incomplete",
|
||||
"has_degraded",
|
||||
|
|
|
@ -9,18 +9,20 @@
|
|||
#include "osd_ops.h"
|
||||
|
||||
// Placement group states
|
||||
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
||||
// Exactly one of these:
|
||||
#define PG_OFFLINE (1<<0)
|
||||
#define PG_STARTING (1<<0)
|
||||
#define PG_PEERING (1<<1)
|
||||
#define PG_INCOMPLETE (1<<2)
|
||||
#define PG_ACTIVE (1<<3)
|
||||
#define PG_STOPPING (1<<4)
|
||||
#define PG_OFFLINE (1<<5)
|
||||
// Plus any of these:
|
||||
#define PG_DEGRADED (1<<5)
|
||||
#define PG_HAS_INCOMPLETE (1<<6)
|
||||
#define PG_HAS_DEGRADED (1<<7)
|
||||
#define PG_HAS_MISPLACED (1<<8)
|
||||
#define PG_HAS_UNCLEAN (1<<9)
|
||||
#define PG_DEGRADED (1<<6)
|
||||
#define PG_HAS_INCOMPLETE (1<<7)
|
||||
#define PG_HAS_DEGRADED (1<<8)
|
||||
#define PG_HAS_MISPLACED (1<<9)
|
||||
#define PG_HAS_UNCLEAN (1<<10)
|
||||
|
||||
// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
#define STRIPE_MASK ((uint64_t)4096 - 1)
|
||||
|
@ -33,8 +35,8 @@
|
|||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||
#define OBJ_BUGGY 0x80000
|
||||
|
||||
extern const int pg_state_bits[10];
|
||||
extern const char *pg_state_names[10];
|
||||
extern const int pg_state_bits[];
|
||||
extern const char *pg_state_names[];
|
||||
extern const int pg_state_bit_count;
|
||||
|
||||
struct pg_obj_loc_t
|
||||
|
@ -96,7 +98,7 @@ struct pg_flush_batch_t
|
|||
|
||||
struct pg_t
|
||||
{
|
||||
int state = PG_OFFLINE;
|
||||
int state = 0;
|
||||
uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
|
||||
pg_num_t pg_num;
|
||||
uint64_t clean_count = 0, total_count = 0;
|
||||
|
@ -122,7 +124,7 @@ struct pg_t
|
|||
int inflight = 0; // including write_queue
|
||||
std::multimap<object_id, osd_op_t*> write_queue;
|
||||
|
||||
void calc_object_states();
|
||||
void calc_object_states(int log_level);
|
||||
void print_state();
|
||||
};
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#define _LARGEFILE64_SOURCE
|
||||
|
||||
#include "osd_peering_pg.h"
|
||||
#define STRIPE_SHIFT 12
|
||||
|
||||
/**
|
||||
* TODO tests for object & pg state calculation.
|
||||
|
@ -43,7 +44,7 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
pg.peering_state->list_results[osd_num] = r;
|
||||
}
|
||||
pg.calc_object_states();
|
||||
pg.calc_object_states(0);
|
||||
printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
|
||||
for (auto it: pg.state_dict)
|
||||
{
|
||||
|
|
|
@ -51,7 +51,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||
auto & pg = pgs[cur_op->op_data->pg_num];
|
||||
int n = --pg.inflight;
|
||||
assert(n >= 0);
|
||||
if (n == 0 && (pg.state & PG_STOPPING))
|
||||
if ((pg.state & PG_STOPPING) && n == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
|
@ -513,7 +513,7 @@ resume_8:
|
|||
if (!pg.incomplete_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else if (op_data->object_state->state & OBJ_DEGRADED)
|
||||
|
@ -523,7 +523,7 @@ resume_8:
|
|||
if (!pg.degraded_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else if (op_data->object_state->state & OBJ_MISPLACED)
|
||||
|
@ -533,7 +533,7 @@ resume_8:
|
|||
if (!pg.misplaced_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||
pg.print_state();
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
Loading…
Reference in New Issue