Fix client connection recovery bugs, add dirty_ops limit
parent 68567c0e1f
commit f6d705383a
|
@ -5,6 +5,9 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
|
|
||||||
|
#define PART_SENT 1
|
||||||
|
#define PART_DONE 2
|
||||||
|
#define PART_ERROR 4
|
||||||
#define CACHE_DIRTY 1
|
#define CACHE_DIRTY 1
|
||||||
#define CACHE_FLUSHING 2
|
#define CACHE_FLUSHING 2
|
||||||
#define CACHE_REPEATING 4
|
#define CACHE_REPEATING 4
|
||||||
|
@ -30,7 +33,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||||
{
|
{
|
||||||
// peer_osd just dropped connection
|
// peer_osd just dropped connection
|
||||||
// determine WHICH dirty_buffers are now obsolete and repeat them
|
// determine WHICH dirty_buffers are now obsolete and repeat them
|
||||||
dirty_osds.erase(peer_osd);
|
|
||||||
for (auto & wr: dirty_buffers)
|
for (auto & wr: dirty_buffers)
|
||||||
{
|
{
|
||||||
if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
|
if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
|
||||||
|
@ -97,48 +99,41 @@ void cluster_client_t::continue_ops(bool up_retry)
|
||||||
int j = 0;
|
int j = 0;
|
||||||
for (int i = 0; i < op_queue.size(); i++)
|
for (int i = 0; i < op_queue.size(); i++)
|
||||||
{
|
{
|
||||||
bool rm = false;
|
bool rm = false, is_flush = op_queue[i]->flags & OP_FLUSH_BUFFER;
|
||||||
|
auto opcode = op_queue[i]->opcode;
|
||||||
if (!op_queue[i]->up_wait || up_retry)
|
if (!op_queue[i]->up_wait || up_retry)
|
||||||
{
|
{
|
||||||
op_queue[i]->up_wait = false;
|
op_queue[i]->up_wait = false;
|
||||||
if (op_queue[i]->opcode == OSD_OP_READ)
|
if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
|
||||||
{
|
{
|
||||||
rm = continue_rw(op_queue[i]);
|
if (is_flush || !has_flushes)
|
||||||
}
|
|
||||||
else if (op_queue[i]->opcode == OSD_OP_WRITE)
|
|
||||||
{
|
|
||||||
if (op_queue[i]->flags & OP_FLUSH_BUFFER)
|
|
||||||
{
|
|
||||||
rm = continue_rw(op_queue[i]);
|
|
||||||
if (!rm)
|
|
||||||
{
|
{
|
||||||
// Regular writes can't proceed before buffer flushes
|
// Regular writes can't proceed before buffer flushes
|
||||||
has_flushes = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (!has_flushes)
|
|
||||||
{
|
|
||||||
rm = continue_rw(op_queue[i]);
|
rm = continue_rw(op_queue[i]);
|
||||||
}
|
}
|
||||||
if (!rm)
|
|
||||||
{
|
|
||||||
has_writes = true;
|
|
||||||
}
|
}
|
||||||
}
|
else if (opcode == OSD_OP_SYNC)
|
||||||
else if (op_queue[i]->opcode == OSD_OP_SYNC)
|
|
||||||
{
|
{
|
||||||
if (!has_writes)
|
if (!has_writes)
|
||||||
{
|
{
|
||||||
// SYNC can't proceed before previous writes
|
// SYNC can't proceed before previous writes
|
||||||
rm = continue_sync(op_queue[i]);
|
rm = continue_sync(op_queue[i]);
|
||||||
if (!rm)
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (opcode == OSD_OP_WRITE)
|
||||||
|
{
|
||||||
|
has_writes = has_writes || !rm;
|
||||||
|
if (is_flush)
|
||||||
|
{
|
||||||
|
has_flushes = has_writes || !rm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (opcode == OSD_OP_SYNC)
|
||||||
{
|
{
|
||||||
// Postpone writes until previous SYNC completes
|
// Postpone writes until previous SYNC completes
|
||||||
// ...so dirty_writes can't contain anything newer than SYNC
|
// ...so dirty_writes can't contain anything newer than SYNC
|
||||||
has_flushes = true;
|
has_flushes = has_writes || !rm;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!rm)
|
if (!rm)
|
||||||
{
|
{
|
||||||
|
@ -185,13 +180,26 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||||
// Cluster-wide immediate_commit mode
|
// Cluster-wide immediate_commit mode
|
||||||
immediate_commit = true;
|
immediate_commit = true;
|
||||||
}
|
}
|
||||||
|
if (config.find("client_max_dirty_bytes") != config.end())
|
||||||
|
{
|
||||||
|
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||||
|
}
|
||||||
else if (config.find("client_dirty_limit") != config.end())
|
else if (config.find("client_dirty_limit") != config.end())
|
||||||
{
|
{
|
||||||
client_dirty_limit = config["client_dirty_limit"].uint64_value();
|
// Old name
|
||||||
|
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
||||||
}
|
}
|
||||||
if (!client_dirty_limit)
|
if (config.find("client_max_dirty_ops") != config.end())
|
||||||
{
|
{
|
||||||
client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
|
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||||
|
}
|
||||||
|
if (!client_max_dirty_bytes)
|
||||||
|
{
|
||||||
|
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
||||||
|
}
|
||||||
|
if (!client_max_dirty_ops)
|
||||||
|
{
|
||||||
|
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
||||||
}
|
}
|
||||||
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
|
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
|
||||||
if (!up_wait_retry_interval)
|
if (!up_wait_retry_interval)
|
||||||
|
@ -306,7 +314,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
|
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
|
||||||
{
|
{
|
||||||
if (dirty_bytes >= client_dirty_limit)
|
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
|
||||||
{
|
{
|
||||||
// Push an extra SYNC operation to flush previous writes
|
// Push an extra SYNC operation to flush previous writes
|
||||||
cluster_op_t *sync_op = new cluster_op_t;
|
cluster_op_t *sync_op = new cluster_op_t;
|
||||||
|
@ -317,12 +325,15 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||||
};
|
};
|
||||||
op_queue.push_back(sync_op);
|
op_queue.push_back(sync_op);
|
||||||
dirty_bytes = 0;
|
dirty_bytes = 0;
|
||||||
|
dirty_ops = 0;
|
||||||
}
|
}
|
||||||
dirty_bytes += op->len;
|
dirty_bytes += op->len;
|
||||||
|
dirty_ops++;
|
||||||
}
|
}
|
||||||
else if (op->opcode == OSD_OP_SYNC)
|
else if (op->opcode == OSD_OP_SYNC)
|
||||||
{
|
{
|
||||||
dirty_bytes = 0;
|
dirty_bytes = 0;
|
||||||
|
dirty_ops = 0;
|
||||||
}
|
}
|
||||||
op_queue.push_back(op);
|
op_queue.push_back(op);
|
||||||
continue_ops();
|
continue_ops();
|
||||||
|
@ -457,7 +468,7 @@ resume_0:
|
||||||
}
|
}
|
||||||
if (op->opcode == OSD_OP_WRITE)
|
if (op->opcode == OSD_OP_WRITE)
|
||||||
{
|
{
|
||||||
if (!immediate_commit)
|
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||||
{
|
{
|
||||||
copy_write(op, dirty_buffers);
|
copy_write(op, dirty_buffers);
|
||||||
}
|
}
|
||||||
|
@ -469,13 +480,33 @@ resume_1:
|
||||||
resume_2:
|
resume_2:
|
||||||
// Send unsent parts, if they're not subject to change
|
// Send unsent parts, if they're not subject to change
|
||||||
op->state = 3;
|
op->state = 3;
|
||||||
|
if (op->needs_reslice)
|
||||||
|
{
|
||||||
for (int i = 0; i < op->parts.size(); i++)
|
for (int i = 0; i < op->parts.size(); i++)
|
||||||
{
|
{
|
||||||
if (!op->parts[i].sent && !op->parts[i].done)
|
if (!(op->parts[i].flags & PART_SENT) && op->retval)
|
||||||
|
{
|
||||||
|
op->retval = -EPIPE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto resume_3;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < op->parts.size(); i++)
|
||||||
|
{
|
||||||
|
if (!(op->parts[i].flags & PART_SENT))
|
||||||
{
|
{
|
||||||
if (!try_send(op, i))
|
if (!try_send(op, i))
|
||||||
{
|
{
|
||||||
// We'll need to retry again
|
// We'll need to retry again
|
||||||
|
op->up_wait = true;
|
||||||
|
if (!retry_timeout_id)
|
||||||
|
{
|
||||||
|
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||||
|
{
|
||||||
|
retry_timeout_id = 0;
|
||||||
|
continue_ops(true);
|
||||||
|
});
|
||||||
|
}
|
||||||
op->state = 2;
|
op->state = 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -485,7 +516,7 @@ resume_2:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
resume_3:
|
resume_3:
|
||||||
if (op->sent_count > 0)
|
if (op->inflight_count > 0)
|
||||||
{
|
{
|
||||||
op->state = 3;
|
op->state = 3;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -517,6 +548,10 @@ resume_3:
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
for (int i = 0; i < op->parts.size(); i++)
|
||||||
|
{
|
||||||
|
op->parts[i].flags = 0;
|
||||||
|
}
|
||||||
goto resume_2;
|
goto resume_2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -548,8 +583,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||||
.offset = begin,
|
.offset = begin,
|
||||||
.len = (uint32_t)(end - begin),
|
.len = (uint32_t)(end - begin),
|
||||||
.pg_num = pg_num,
|
.pg_num = pg_num,
|
||||||
.sent = false,
|
.flags = 0,
|
||||||
.done = false,
|
|
||||||
};
|
};
|
||||||
int left = end-begin;
|
int left = end-begin;
|
||||||
while (left > 0 && iov_idx < op->iov.count)
|
while (left > 0 && iov_idx < op->iov.count)
|
||||||
|
@ -606,8 +640,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
{
|
{
|
||||||
int peer_fd = peer_it->second;
|
int peer_fd = peer_it->second;
|
||||||
part->osd_num = primary_osd;
|
part->osd_num = primary_osd;
|
||||||
part->sent = true;
|
part->flags |= PART_SENT;
|
||||||
op->sent_count++;
|
op->inflight_count++;
|
||||||
part->op = (osd_op_t){
|
part->op = (osd_op_t){
|
||||||
.op_type = OSD_OP_OUT,
|
.op_type = OSD_OP_OUT,
|
||||||
.peer_fd = peer_fd,
|
.peer_fd = peer_fd,
|
||||||
|
@ -675,8 +709,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
|
||||||
op->parts[i] = {
|
op->parts[i] = {
|
||||||
.parent = op,
|
.parent = op,
|
||||||
.osd_num = sync_osd,
|
.osd_num = sync_osd,
|
||||||
.sent = false,
|
.flags = 0,
|
||||||
.done = false,
|
|
||||||
};
|
};
|
||||||
send_sync(op, &op->parts[i]);
|
send_sync(op, &op->parts[i]);
|
||||||
i++;
|
i++;
|
||||||
|
@ -684,7 +717,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
|
||||||
}
|
}
|
||||||
dirty_osds.clear();
|
dirty_osds.clear();
|
||||||
resume_1:
|
resume_1:
|
||||||
if (op->sent_count > 0)
|
if (op->inflight_count > 0)
|
||||||
{
|
{
|
||||||
op->state = 1;
|
op->state = 1;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -703,7 +736,7 @@ resume_1:
|
||||||
// Retry later
|
// Retry later
|
||||||
op->parts.clear();
|
op->parts.clear();
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
op->sent_count = 0;
|
op->inflight_count = 0;
|
||||||
op->done_count = 0;
|
op->done_count = 0;
|
||||||
op->state = 0;
|
op->state = 0;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -730,8 +763,8 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||||
{
|
{
|
||||||
auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
|
auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
|
||||||
assert(peer_it != msgr.osd_peer_fds.end());
|
assert(peer_it != msgr.osd_peer_fds.end());
|
||||||
part->sent = true;
|
part->flags |= PART_SENT;
|
||||||
op->sent_count++;
|
op->inflight_count++;
|
||||||
part->op = (osd_op_t){
|
part->op = (osd_op_t){
|
||||||
.op_type = OSD_OP_OUT,
|
.op_type = OSD_OP_OUT,
|
||||||
.peer_fd = peer_it->second,
|
.peer_fd = peer_it->second,
|
||||||
|
@ -753,8 +786,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||||
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
{
|
{
|
||||||
cluster_op_t *op = part->parent;
|
cluster_op_t *op = part->parent;
|
||||||
part->sent = false;
|
op->inflight_count--;
|
||||||
op->sent_count--;
|
|
||||||
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
|
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
|
||||||
if (part->op.reply.hdr.retval != expected)
|
if (part->op.reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
|
@ -763,9 +795,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
"%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
"%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
);
|
);
|
||||||
msgr.stop_client(part->op.peer_fd);
|
|
||||||
if (part->op.reply.hdr.retval == -EPIPE)
|
if (part->op.reply.hdr.retval == -EPIPE)
|
||||||
{
|
{
|
||||||
|
// Mark op->up_wait = true before stopping the client
|
||||||
op->up_wait = true;
|
op->up_wait = true;
|
||||||
if (!retry_timeout_id)
|
if (!retry_timeout_id)
|
||||||
{
|
{
|
||||||
|
@ -781,15 +813,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||||
// Don't overwrite other errors with -EPIPE
|
// Don't overwrite other errors with -EPIPE
|
||||||
op->retval = part->op.reply.hdr.retval;
|
op->retval = part->op.reply.hdr.retval;
|
||||||
}
|
}
|
||||||
|
msgr.stop_client(part->op.peer_fd);
|
||||||
|
part->flags |= PART_ERROR;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// OK
|
// OK
|
||||||
dirty_osds.insert(part->osd_num);
|
dirty_osds.insert(part->osd_num);
|
||||||
part->done = true;
|
part->flags |= PART_DONE;
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
}
|
}
|
||||||
if (op->sent_count == 0)
|
if (op->inflight_count == 0)
|
||||||
{
|
{
|
||||||
continue_ops();
|
continue_ops();
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,8 @@
|
||||||
#define MAX_BLOCK_SIZE 128*1024*1024
|
#define MAX_BLOCK_SIZE 128*1024*1024
|
||||||
#define DEFAULT_DISK_ALIGNMENT 4096
|
#define DEFAULT_DISK_ALIGNMENT 4096
|
||||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||||
#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
|
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
|
||||||
|
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
|
||||||
|
|
||||||
struct cluster_op_t;
|
struct cluster_op_t;
|
||||||
|
|
||||||
|
@ -22,8 +23,7 @@ struct cluster_op_part_t
|
||||||
pg_num_t pg_num;
|
pg_num_t pg_num;
|
||||||
osd_num_t osd_num;
|
osd_num_t osd_num;
|
||||||
osd_op_buf_list_t iov;
|
osd_op_buf_list_t iov;
|
||||||
bool sent;
|
unsigned flags;
|
||||||
bool done;
|
|
||||||
osd_op_t op;
|
osd_op_t op;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ protected:
|
||||||
cluster_op_t *orig_op = NULL;
|
cluster_op_t *orig_op = NULL;
|
||||||
bool needs_reslice = false;
|
bool needs_reslice = false;
|
||||||
bool up_wait = false;
|
bool up_wait = false;
|
||||||
int sent_count = 0, done_count = 0;
|
int inflight_count = 0, done_count = 0;
|
||||||
std::vector<cluster_op_part_t> parts;
|
std::vector<cluster_op_part_t> parts;
|
||||||
friend class cluster_client_t;
|
friend class cluster_client_t;
|
||||||
};
|
};
|
||||||
|
@ -66,7 +66,8 @@ class cluster_client_t
|
||||||
std::map<pool_id_t, uint64_t> pg_counts;
|
std::map<pool_id_t, uint64_t> pg_counts;
|
||||||
bool immediate_commit = false;
|
bool immediate_commit = false;
|
||||||
// FIXME: Implement inmemory_commit mode. Note that it requires returning overlapping reads from memory.
|
// FIXME: Implement inmemory_commit mode. Note that it requires returning overlapping reads from memory.
|
||||||
uint64_t client_dirty_limit = 0;
|
uint64_t client_max_dirty_bytes = 0;
|
||||||
|
uint64_t client_max_dirty_ops = 0;
|
||||||
int log_level;
|
int log_level;
|
||||||
int up_wait_retry_interval = 500; // ms
|
int up_wait_retry_interval = 500; // ms
|
||||||
|
|
||||||
|
@ -76,7 +77,7 @@ class cluster_client_t
|
||||||
std::deque<cluster_op_t*> op_queue;
|
std::deque<cluster_op_t*> op_queue;
|
||||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||||
std::set<osd_num_t> dirty_osds;
|
std::set<osd_num_t> dirty_osds;
|
||||||
uint64_t dirty_bytes = 0;
|
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||||
|
|
||||||
bool pgs_loaded = false;
|
bool pgs_loaded = false;
|
||||||
ring_consumer_t consumer;
|
ring_consumer_t consumer;
|
||||||
|
|
|
@ -180,23 +180,12 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||||
on_connect_peer(peer_osd, -errno);
|
on_connect_peer(peer_osd, -errno);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int timeout_id = -1;
|
|
||||||
if (peer_connect_timeout > 0)
|
|
||||||
{
|
|
||||||
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
|
||||||
{
|
|
||||||
osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
|
|
||||||
stop_client(peer_fd, true);
|
|
||||||
on_connect_peer(peer_osd, -EIO);
|
|
||||||
return;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
clients[peer_fd] = new osd_client_t((osd_client_t){
|
clients[peer_fd] = new osd_client_t((osd_client_t){
|
||||||
.peer_addr = addr,
|
.peer_addr = addr,
|
||||||
.peer_port = peer_port,
|
.peer_port = peer_port,
|
||||||
.peer_fd = peer_fd,
|
.peer_fd = peer_fd,
|
||||||
.peer_state = PEER_CONNECTING,
|
.peer_state = PEER_CONNECTING,
|
||||||
.connect_timeout_id = timeout_id,
|
.connect_timeout_id = -1,
|
||||||
.osd_num = peer_osd,
|
.osd_num = peer_osd,
|
||||||
.in_buf = malloc_or_die(receive_buffer_size),
|
.in_buf = malloc_or_die(receive_buffer_size),
|
||||||
});
|
});
|
||||||
|
@ -205,6 +194,16 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||||
// Either OUT (connected) or HUP
|
// Either OUT (connected) or HUP
|
||||||
handle_connect_epoll(peer_fd);
|
handle_connect_epoll(peer_fd);
|
||||||
});
|
});
|
||||||
|
if (peer_connect_timeout > 0)
|
||||||
|
{
|
||||||
|
clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
||||||
|
{
|
||||||
|
osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
|
||||||
|
stop_client(peer_fd, true);
|
||||||
|
on_connect_peer(peer_osd, -EIO);
|
||||||
|
return;
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||||
|
|
|
@ -8,12 +8,19 @@
|
||||||
|
|
||||||
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
|
std::vector<osd_op_t*> cancel_ops;
|
||||||
|
cancel_ops.resize(cl->sent_ops.size());
|
||||||
|
int i = 0;
|
||||||
for (auto p: cl->sent_ops)
|
for (auto p: cl->sent_ops)
|
||||||
{
|
{
|
||||||
cancel_op(p.second);
|
cancel_ops[i++] = p.second;
|
||||||
}
|
}
|
||||||
cl->sent_ops.clear();
|
cl->sent_ops.clear();
|
||||||
cl->outbox.clear();
|
cl->outbox.clear();
|
||||||
|
for (auto op: cancel_ops)
|
||||||
|
{
|
||||||
|
cancel_op(op);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::cancel_op(osd_op_t *op)
|
void osd_messenger_t::cancel_op(osd_op_t *op)
|
||||||
|
|
|
@ -124,6 +124,15 @@ void pretend_disconnected(cluster_client_t *cli, osd_num_t osd_num)
|
||||||
cli->msgr.stop_client(cli->msgr.osd_peer_fds.at(osd_num));
|
cli->msgr.stop_client(cli->msgr.osd_peer_fds.at(osd_num));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void check_disconnected(cluster_client_t *cli, osd_num_t osd_num)
|
||||||
|
{
|
||||||
|
if (cli->msgr.osd_peer_fds.find(osd_num) != cli->msgr.osd_peer_fds.end())
|
||||||
|
{
|
||||||
|
printf("OSD %lu not disconnected as it ought to be\n", osd_num);
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void check_op_count(cluster_client_t *cli, osd_num_t osd_num, int ops)
|
void check_op_count(cluster_client_t *cli, osd_num_t osd_num, int ops)
|
||||||
{
|
{
|
||||||
int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
|
int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
|
||||||
|
@ -152,20 +161,20 @@ osd_op_t *find_op(cluster_client_t *cli, osd_num_t osd_num, uint64_t opcode, uin
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void pretend_op_completed(cluster_client_t *cli, osd_op_t *op, int retval)
|
void pretend_op_completed(cluster_client_t *cli, osd_op_t *op, int64_t retval)
|
||||||
{
|
{
|
||||||
assert(op);
|
assert(op);
|
||||||
printf("Pretend completed %s %lx+%x\n", op->req.hdr.opcode == OSD_OP_SYNC
|
printf("Pretend completed %s %lx+%x\n", op->req.hdr.opcode == OSD_OP_SYNC
|
||||||
? "sync" : (op->req.hdr.opcode == OSD_OP_WRITE ? "write" : "read"), op->req.rw.offset, op->req.rw.len);
|
? "sync" : (op->req.hdr.opcode == OSD_OP_WRITE ? "write" : "read"), op->req.rw.offset, op->req.rw.len);
|
||||||
uint64_t op_id = op->req.hdr.id;
|
uint64_t op_id = op->req.hdr.id;
|
||||||
int peer_fd = op->peer_fd;
|
int peer_fd = op->peer_fd;
|
||||||
|
cli->msgr.clients[peer_fd]->sent_ops.erase(op_id);
|
||||||
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||||
op->reply.hdr.id = op->req.hdr.id;
|
op->reply.hdr.id = op->req.hdr.id;
|
||||||
op->reply.hdr.opcode = op->req.hdr.opcode;
|
op->reply.hdr.opcode = op->req.hdr.opcode;
|
||||||
op->reply.hdr.retval = retval < 0 ? retval : (op->req.hdr.opcode == OSD_OP_SYNC ? 0 : op->req.rw.len);
|
op->reply.hdr.retval = retval < 0 ? retval : (op->req.hdr.opcode == OSD_OP_SYNC ? 0 : op->req.rw.len);
|
||||||
// Copy lambda to be unaffected by `delete op`
|
// Copy lambda to be unaffected by `delete op`
|
||||||
std::function<void(osd_op_t*)>(op->callback)(op);
|
std::function<void(osd_op_t*)>(op->callback)(op);
|
||||||
cli->msgr.clients[peer_fd]->sent_ops.erase(op_id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void test1()
|
void test1()
|
||||||
|
@ -177,6 +186,7 @@ void test1()
|
||||||
int *r1 = test_write(cli, 0, 4096, 0x55);
|
int *r1 = test_write(cli, 0, 4096, 0x55);
|
||||||
configure_single_pg_pool(cli);
|
configure_single_pg_pool(cli);
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
|
cli->continue_ops(true);
|
||||||
can_complete(r1);
|
can_complete(r1);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||||
|
@ -184,6 +194,8 @@ void test1()
|
||||||
pretend_disconnected(cli, 1);
|
pretend_disconnected(cli, 1);
|
||||||
int *r2 = test_sync(cli);
|
int *r2 = test_sync(cli);
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
|
check_op_count(cli, 1, 0);
|
||||||
|
cli->continue_ops(true);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
|
@ -226,8 +238,8 @@ void test1()
|
||||||
r1 = test_write(cli, 0x10000, 0x4000, 0x58);
|
r1 = test_write(cli, 0x10000, 0x4000, 0x58);
|
||||||
|
|
||||||
pretend_disconnected(cli, 1);
|
pretend_disconnected(cli, 1);
|
||||||
cli->continue_ops(true);
|
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
|
cli->continue_ops(true);
|
||||||
|
|
||||||
// Check replay
|
// Check replay
|
||||||
{
|
{
|
||||||
|
@ -260,8 +272,11 @@ void test1()
|
||||||
assert(offset == op->req.rw.offset+op->req.rw.len);
|
assert(offset == op->req.rw.offset+op->req.rw.len);
|
||||||
replay_ops.push_back(op);
|
replay_ops.push_back(op);
|
||||||
}
|
}
|
||||||
assert(replay_start == 0);
|
if (replay_start != 0 || replay_end != 0x14000)
|
||||||
assert(replay_end == 0x14000);
|
{
|
||||||
|
printf("Write replay: range mismatch: %lx-%lx\n", replay_start, replay_end);
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
for (auto op: replay_ops)
|
for (auto op: replay_ops)
|
||||||
{
|
{
|
||||||
pretend_op_completed(cli, op, 0);
|
pretend_op_completed(cli, op, 0);
|
||||||
|
@ -273,6 +288,28 @@ void test1()
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x10000, 0x4000), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x10000, 0x4000), 0);
|
||||||
check_completed(r1);
|
check_completed(r1);
|
||||||
check_op_count(cli, 1, 0);
|
check_op_count(cli, 1, 0);
|
||||||
|
|
||||||
|
// Check sync
|
||||||
|
r2 = test_sync(cli);
|
||||||
|
can_complete(r2);
|
||||||
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
|
||||||
|
check_completed(r2);
|
||||||
|
|
||||||
|
// Check disconnect during write
|
||||||
|
r1 = test_write(cli, 0, 4096, 0x59);
|
||||||
|
check_op_count(cli, 1, 1);
|
||||||
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
||||||
|
check_disconnected(cli, 1);
|
||||||
|
pretend_connected(cli, 1);
|
||||||
|
check_op_count(cli, 1, 0);
|
||||||
|
cli->continue_ops(true);
|
||||||
|
check_op_count(cli, 1, 1);
|
||||||
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||||
|
check_op_count(cli, 1, 1);
|
||||||
|
can_complete(r1);
|
||||||
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||||
|
check_completed(r1);
|
||||||
|
|
||||||
// Free client
|
// Free client
|
||||||
delete cli;
|
delete cli;
|
||||||
delete tfd;
|
delete tfd;
|
||||||
|
|
|
@ -121,7 +121,7 @@ again:
|
||||||
exp.it_value.tv_sec--;
|
exp.it_value.tv_sec--;
|
||||||
exp.it_value.tv_nsec += 1000000000;
|
exp.it_value.tv_nsec += 1000000000;
|
||||||
}
|
}
|
||||||
if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
|
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
|
||||||
{
|
{
|
||||||
// It already happened
|
// It already happened
|
||||||
trigger_nearest();
|
trigger_nearest();
|
||||||
|
@ -159,6 +159,6 @@ void timerfd_manager_t::trigger_nearest()
|
||||||
{
|
{
|
||||||
timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
|
timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
|
||||||
}
|
}
|
||||||
cb(nearest_id);
|
|
||||||
nearest = -1;
|
nearest = -1;
|
||||||
|
cb(nearest_id);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue