Makefile Normal file
View File

@ -0,0 +1,90 @@
# Build rules for the blockstore library, the OSD daemon, the fio engines and
# the stand-alone test / benchmark utilities.
#
# NOTE(review): the shared-library file names used below (libblockstore.so,
# libfio_blockstore.so, libfio_sec_osd.so) were reconstructed from recipes
# whose target names were missing -- confirm them against the repository.
#
# The compiler is taken from $(CXX) (GNU make's built-in default is g++), so
# `make CXX=clang++` works without editing this file.

# Objects that make up the blockstore library.
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
    blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o

# Append -fsanitize=address here for an AddressSanitizer build.
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always

# `all` and `clean` are commands, not files.
.PHONY: all clean
# Never leave a half-written object or binary behind when a recipe fails.
.DELETE_ON_ERROR:

# Default goal: build everything that is shipped.
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so stub_osd stub_bench osd_test dump_journal

# Remove intermediate object files only (binaries are kept).
clean:
	$(RM) *.o

# ---------------------------------------------------------------------------
# Common / blockstore objects
# ---------------------------------------------------------------------------

# crc32c.c is deliberately compiled with the C++ driver, as before.
crc32c.o: crc32c.c
	$(CXX) $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
	$(CXX) $(CXXFLAGS) -c -o $@ $<
allocator.o: allocator.cpp allocator.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<

# Fallback for every object without an explicit rule (the blockstore_*.o
# files): rebuild whenever any blockstore header changes.
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h object_id.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<

# Journal dump tool. Only crc32c.o is linked: linking $(BLOCKSTORE_OBJS) as
# well would pass crc32c.o to the linker twice.
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
	$(CXX) $(CXXFLAGS) -o $@ $< crc32c.o

# Blockstore as a shared library.
libblockstore.so: $(BLOCKSTORE_OBJS)
	$(CXX) $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring

# fio I/O engine that drives the blockstore directly.
libfio_blockstore.so: libblockstore.so fio_engine.cpp json11.o
	$(CXX) $(CXXFLAGS) -shared -o $@ fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring

# ---------------------------------------------------------------------------
# OSD objects
# ---------------------------------------------------------------------------

OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
    osd_primary.o osd_primary_subops.o etcd_state_client.o cluster_client.o osd_cluster.o http_client.o pg_states.o \
    osd_rmw.o json11.o base64.o timerfd_manager.o

base64.o: base64.cpp base64.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp osd.h osd_ops.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp etcd_state_client.h http_client.h pg_states.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h osd_ops.h timerfd_manager.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h pg_states.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
# Only osd_rmw_test.cpp is compiled; osd_rmw.cpp is listed as a prerequisite
# but not passed to the compiler (presumably it is #included by the test --
# verify).
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
	$(CXX) $(CXXFLAGS) -o $@ $<
osd_primary.o: osd_primary.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp osd.h http_client.h osd_ops.h osd_peering_pg.h ringloop.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<

# ---------------------------------------------------------------------------
# Executables
# ---------------------------------------------------------------------------

# OSD daemon. Linked against ./libblockstore.so by relative path.
osd: libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
	$(CXX) $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
	$(CXX) $(CXXFLAGS) -o $@ stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
	$(CXX) $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
rw_blocking.o: rw_blocking.cpp rw_blocking.h
	$(CXX) $(CXXFLAGS) -c -o $@ $<
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
	$(CXX) $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
	$(CXX) $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal

# fio I/O engine that talks to a secondary OSD over the network.
# Libraries are listed after the objects so the linker does not discard them
# before seeing the symbols that need them.
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
	$(CXX) $(CXXFLAGS) -shared -o $@ fio_sec_osd.cpp rw_blocking.o -ltcmalloc_minimal -luring

test_blockstore: libblockstore.so test_blockstore.cpp timerfd_interval.o
	$(CXX) $(CXXFLAGS) -o $@ test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
# `test` is a real binary built from test.cpp, so it is intentionally NOT phony.
test: test.cpp osd_peering_pg.o
	$(CXX) $(CXXFLAGS) -o $@ test.cpp osd_peering_pg.o -luring -lm
test_allocator: test_allocator.cpp allocator.o
	$(CXX) $(CXXFLAGS) -o $@ test_allocator.cpp allocator.o

View File

@ -1,6 +1,3 @@
#include <stdexcept> #include <stdexcept>
#include "allocator.h" #include "allocator.h"
@ -13,19 +10,19 @@ allocator::allocator(uint64_t blocks)
{ {
throw std::invalid_argument("blocks"); throw std::invalid_argument("blocks");
} }
uint64_t p2 = 1; uint64_t p2 = 1, total = 1;
total = 0;
while (p2 * 64 < blocks) while (p2 * 64 < blocks)
{ {
total += p2;
p2 = p2 * 64; p2 = p2 * 64;
total += p2;
} }
total -= p2;
total += (blocks+63) / 64; total += (blocks+63) / 64;
mask = new uint64_t[total]; mask = new uint64_t[2 + total];
size = free = blocks; size = free = blocks;
last_one_mask = (blocks % 64) == 0 last_one_mask = (blocks % 64) == 0
: ((1l << (blocks % 64)) - 1); : ~(UINT64_MAX << (64 - blocks % 64));
for (uint64_t i = 0; i < total; i++) for (uint64_t i = 0; i < total; i++)
{ {
mask[i] = 0; mask[i] = 0;
@ -99,10 +96,6 @@ uint64_t allocator::find_free()
uint64_t p2 = 1, offset = 0, addr = 0, f, i; uint64_t p2 = 1, offset = 0, addr = 0, f, i;
while (p2 < size) while (p2 < size)
{ {
if (offset+addr >= total)
return UINT64_MAX;
uint64_t m = mask[offset + addr]; uint64_t m = mask[offset + addr];
for (i = 0, f = 1; i < 64; i++, f <<= 1) for (i = 0, f = 1; i < 64; i++, f <<= 1)
{ {
@ -117,6 +110,11 @@ uint64_t allocator::find_free()
return UINT64_MAX; return UINT64_MAX;
} }
addr = (addr * 64) | i; addr = (addr * 64) | i;
if (addr >= size)
// No space
return UINT64_MAX;
offset += p2; offset += p2;
p2 = p2 * 64; p2 = p2 * 64;
} }
@ -127,35 +125,3 @@ uint64_t allocator::get_free_count()
{ {
return free; return free;
} }
void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
if (start == 0)
if (len == 32*bitmap_granularity)
*((uint32_t*)bitmap) = UINT32_MAX;
else if (len == 64*bitmap_granularity)
*((uint64_t*)bitmap) = UINT64_MAX;
unsigned bit_start = start / bitmap_granularity;
unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
while (bit_start < bit_end)
if (!(bit_start & 7) && bit_end >= bit_start+8)
((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
bit_start += 8;
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);

View File

#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
@ -8,7 +5,6 @@
// Hierarchical bitmap allocator // Hierarchical bitmap allocator
class allocator class allocator
{ {
uint64_t total;
uint64_t size; uint64_t size;
uint64_t free; uint64_t free;
uint64_t last_one_mask; uint64_t last_one_mask;
@ -20,5 +16,3 @@ public:
uint64_t find_free(); uint64_t find_free();
uint64_t get_free_count(); uint64_t get_free_count();
}; };
void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);

View File

#include "base64.h" #include "base64.h"
std::string base64_encode(const std::string &in) std::string base64_encode(const std::string &in)

View File

#pragma once #pragma once
#include <string> #include <string>

View File

#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop) blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop)
@ -35,7 +32,12 @@ bool blockstore_t::is_safe_to_stop()
void blockstore_t::enqueue_op(blockstore_op_t *op) void blockstore_t::enqueue_op(blockstore_op_t *op)
{ {
impl->enqueue_op(op); impl->enqueue_op(op, false);
void blockstore_t::enqueue_op_first(blockstore_op_t *op)
impl->enqueue_op(op, true);
} }
std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes() std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
@ -43,11 +45,6 @@ std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
return impl->unstable_writes; return impl->unstable_writes;
} }
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
return impl->inode_space_stats;
uint32_t blockstore_t::get_block_size() uint32_t blockstore_t::get_block_size()
{ {
return impl->get_block_size(); return impl->get_block_size();
@ -63,7 +60,7 @@ uint64_t blockstore_t::get_free_block_count()
return impl->get_free_block_count(); return impl->get_free_block_count();
} }
uint32_t blockstore_t::get_bitmap_granularity() uint32_t blockstore_t::get_disk_alignment()
{ {
return impl->get_bitmap_granularity(); return impl->get_disk_alignment();
} }

// License: VNPL-1.1 (see for details)
#pragma once #pragma once
@ -9,7 +6,6 @@
#include <stdint.h> #include <stdint.h>
#include <string>
#include <map> #include <map>
#include <unordered_map> #include <unordered_map>
#include <functional> #include <functional>
@ -27,19 +23,17 @@
#define DEFAULT_ORDER 17 #define DEFAULT_ORDER 17
#define MIN_BLOCK_SIZE 4*1024 #define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024 #define MAX_BLOCK_SIZE 128*1024*1024
#define BS_OP_MIN 1 #define BS_OP_MIN 1
#define BS_OP_READ 1 #define BS_OP_READ 1
#define BS_OP_WRITE 2 #define BS_OP_WRITE 2
#define BS_OP_WRITE_STABLE 3 #define BS_OP_SYNC 3
#define BS_OP_SYNC 4 #define BS_OP_STABLE 4
#define BS_OP_STABLE 5 #define BS_OP_DELETE 5
#define BS_OP_DELETE 6 #define BS_OP_LIST 6
#define BS_OP_LIST 7 #define BS_OP_ROLLBACK 7
#define BS_OP_ROLLBACK 8 #define BS_OP_SYNC_STAB_ALL 8
#define BS_OP_SYNC_STAB_ALL 9 #define BS_OP_MAX 8
#define BS_OP_MAX 9
@ -47,9 +41,9 @@
Blockstore opcode documentation: Blockstore opcode documentation:
Read or write object data. WRITE_STABLE writes a version that doesn't require marking as stable. Read or write object data.
Input: Input:
- oid = requested object - oid = requested object
@ -65,8 +59,6 @@ Input:
- offset, len = offset and length within object. length may be zero, in that case - offset, len = offset and length within object. length may be zero, in that case
read operation only returns the version / write operation only bumps the version read operation only returns the version / write operation only bumps the version
- buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0. - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = pointer to the new 'external' object bitmap data. Its part which is respective to the
write request is copied into the metadata area bitwise and stored there.
Output: Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
@ -121,8 +113,6 @@ Input:
- oid.stripe = PG alignment - oid.stripe = PG alignment
- len = PG count or 0 to list all objects - len = PG count or 0 to list all objects
- offset = PG number - offset = PG number
- oid.inode = min inode number or 0 to list all inodes
- version = max inode number or 0 to list all inodes
Output: Output:
- retval = total obj_ver_id count - retval = total obj_ver_id count
@ -144,7 +134,6 @@ struct blockstore_op_t
uint32_t offset; uint32_t offset;
uint32_t len; uint32_t len;
void *buf; void *buf;
void *bitmap;
int retval; int retval;
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE]; uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
@ -179,16 +168,17 @@ public:
// Submission // Submission
void enqueue_op(blockstore_op_t *op); void enqueue_op(blockstore_op_t *op);
// Insert operation into the beginning of the queue
// Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
void enqueue_op_first(blockstore_op_t *op);
// Unstable writes are added here (map of object_id -> version) // Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> & get_unstable_writes(); std::unordered_map<object_id, uint64_t> & get_unstable_writes();
// Get per-inode space usage statistics
std::map<uint64_t, uint64_t> & get_inode_space_stats();
// FIXME rename to object_size // FIXME rename to object_size
uint32_t get_block_size(); uint32_t get_block_size();
uint64_t get_block_count(); uint64_t get_block_count();
uint64_t get_free_block_count(); uint64_t get_free_block_count();
uint32_t get_bitmap_granularity(); uint32_t get_disk_alignment();
}; };

View File

@ -1,24 +1,16 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs) journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
{ {
this->bs = bs; this->bs = bs;
this->flusher_count = flusher_count; this->flusher_count = flusher_count;
this->cur_flusher_count = 1;
this->target_flusher_count = 1;
dequeuing = false; dequeuing = false;
trimming = false;
active_flushers = 0; active_flushers = 0;
syncing_flushers = 0; syncing_flushers = 0;
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable); journal_trim_interval = sync_threshold;
journal_trim_interval = 512;
journal_trim_counter = 0; journal_trim_counter = 0;
trim_wanted = 0; journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
co = new journal_flusher_co[flusher_count]; co = new journal_flusher_co[flusher_count];
for (int i = 0; i < flusher_count; i++) for (int i = 0; i < flusher_count; i++)
{ {
@ -70,31 +62,14 @@ bool journal_flusher_t::is_active()
void journal_flusher_t::loop() void journal_flusher_t::loop()
{ {
target_flusher_count = bs->write_iodepth*2; for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
if (target_flusher_count <= 0)
target_flusher_count = 1;
else if (target_flusher_count > flusher_count)
target_flusher_count = flusher_count;
if (target_flusher_count > cur_flusher_count)
cur_flusher_count = target_flusher_count;
else if (target_flusher_count < cur_flusher_count)
{ {
while (target_flusher_count < cur_flusher_count)
if (co[cur_flusher_count-1].wait_state)
for (int i = 0; (active_flushers > 0 || dequeuing) && i < cur_flusher_count; i++)
co[i].loop(); co[i].loop();
} }
void journal_flusher_t::enqueue_flush(obj_ver_id ov) void journal_flusher_t::enqueue_flush(obj_ver_id ov)
{ {
printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
auto it = flush_versions.find(ov.oid); auto it = flush_versions.find(ov.oid);
if (it != flush_versions.end()) if (it != flush_versions.end())
{ {
@ -106,18 +81,15 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version; flush_versions[ov.oid] = ov.version;
flush_queue.push_back(ov.oid); flush_queue.push_back(ov.oid);
} }
if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0)) if (!dequeuing && flush_queue.size() >= sync_threshold)
{ {
dequeuing = true; dequeuing = true;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
} }
} }
void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force) void journal_flusher_t::unshift_flush(obj_ver_id ov)
{ {
printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
auto it = flush_versions.find(ov.oid); auto it = flush_versions.find(ov.oid);
if (it != flush_versions.end()) if (it != flush_versions.end())
{ {
@ -127,62 +99,28 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
else else
{ {
flush_versions[ov.oid] = ov.version; flush_versions[ov.oid] = ov.version;
if (!force)
if (force)
flush_queue.push_front(ov.oid); flush_queue.push_front(ov.oid);
if (force || !dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0)) }
if (!dequeuing && flush_queue.size() >= sync_threshold)
{ {
dequeuing = true; dequeuing = true;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
} }
} }
void journal_flusher_t::remove_flush(object_id oid) void journal_flusher_t::force_start()
printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
auto v_it = flush_versions.find(oid);
if (v_it != flush_versions.end())
for (auto q_it = flush_queue.begin(); q_it != flush_queue.end(); q_it++)
if (*q_it == oid)
void journal_flusher_t::request_trim()
{ {
dequeuing = true; dequeuing = true;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
} }
void journal_flusher_t::mark_trim_possible()
if (trim_wanted > 0)
dequeuing = true;
void journal_flusher_t::release_trim()
#define await_sqe(label) \ #define await_sqe(label) \
resume_##label:\ resume_##label:\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
sqe = bs->get_sqe();\ sqe = bs->get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@ -230,22 +168,9 @@ bool journal_flusher_co::loop()
goto resume_17; goto resume_17;
else if (wait_state == 18) else if (wait_state == 18)
goto resume_18; goto resume_18;
else if (wait_state == 19)
goto resume_19;
else if (wait_state == 20)
goto resume_20;
else if (wait_state == 21)
goto resume_21;
resume_0: resume_0:
if (!flusher->flush_queue.size() || !flusher->dequeuing) if (!flusher->flush_queue.size() || !flusher->dequeuing)
{ {
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
// Attempt forced trim
goto trim_journal;
flusher->dequeuing = false; flusher->dequeuing = false;
wait_state = 0; wait_state = 0;
return true; return true;
@ -257,11 +182,21 @@ stop_flusher:
dirty_end = bs->dirty_db.find(cur); dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end()) if (dirty_end != bs->dirty_db.end())
{ {
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
// We can't flush journal sectors that are still written to
flusher->dequeuing = false;
wait_state = 0;
return true;
repeat_it = flusher->sync_to_repeat.find(cur.oid); repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end()) if (repeat_it != flusher->sync_to_repeat.end())
{ {
printf("Postpone %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version); printf("Postpone %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif #endif
// We don't flush different parts of history of the same object in parallel // We don't flush different parts of history of the same object in parallel
// So we check if someone is already flushing this object // So we check if someone is already flushing this object
@ -274,110 +209,42 @@ stop_flusher:
} }
else else
flusher->sync_to_repeat[cur.oid] = 0; flusher->sync_to_repeat[cur.oid] = 0;
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
// We can't flush journal sectors that are still written to
// However, as we group flushes by oid, current oid may have older writes to flush!
// And it may even block writes if we don't flush the older version
// (if it's in the beginning of the journal)...
// So first try to find an older version of the same object to flush.
bool found = false;
while (dirty_end != bs->dirty_db.begin())
if (dirty_end->first.oid != cur.oid)
if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start)))
found = true;
cur.version = dirty_end->first.version;
if (!found)
// Try other objects
int search_left = flusher->flush_queue.size() - 1;
printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left); printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
while (search_left > 0)
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end())
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
printf("Write %lx:%lx v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it == flusher->sync_to_repeat.end())
flusher->sync_to_repeat[cur.oid] = 0;
if (search_left <= 0)
printf("No older flushes, stopping\n");
goto stop_flusher;
printf("Flushing %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif #endif
flusher->active_flushers++; flusher->active_flushers++;
resume_1: resume_1:
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
// Scan dirty versions of the object // Scan dirty versions of the object
if (!scan_dirty(1)) if (!scan_dirty(1))
{ {
wait_state += 1; wait_state += 1;
return false; return false;
} }
// Writes and deletes shouldn't happen at the same time if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
assert(!has_writes || !has_delete);
if (!has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
{ {
// Nothing to flush // Nothing to flush
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc); flusher->active_flushers--;
goto release_oid; repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
wait_state = 0;
goto resume_0;
} }
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
if (clean_loc == UINT64_MAX) if (clean_loc == UINT64_MAX)
{ {
if (old_clean_loc == UINT64_MAX) if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
{ {
// Object not allocated. This is a bug. // Object not allocated. This is a bug.
char err[1024]; char err[1024];
snprintf( snprintf(
err, 1024, "BUG: Object %lx:%lx v%lu that we are trying to flush is not allocated on the data device", err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device",
cur.oid.inode, cur.oid.stripe, cur.version cur.oid.inode, cur.oid.stripe, cur.version
); );
throw std::runtime_error(err); throw std::runtime_error(err);
@ -426,18 +293,18 @@ resume_1:
{ {
new_clean_bitmap = (bs->inmemory_meta new_clean_bitmap = (bs->inmemory_meta
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry) ? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size)); : bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
if (clean_init_bitmap) if (clean_init_bitmap)
{ {
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size); memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->bitmap_granularity); bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len);
} }
} }
for (it = v.begin(); it != v.end(); it++) for (it = v.begin(); it != v.end(); it++)
{ {
if (new_clean_bitmap) if (new_clean_bitmap)
{ {
bitmap_set(new_clean_bitmap, it->offset, it->len, bs->bitmap_granularity); bitmap_set(new_clean_bitmap, it->offset, it->len);
} }
await_sqe(4); await_sqe(4);
data->iov = (struct iovec){ it->buf, (size_t)it->len }; data->iov = (struct iovec){ it->buf, (size_t)it->len };
@ -471,7 +338,6 @@ resume_1:
wait_state = 5; wait_state = 5;
return false; return false;
} }
// zero out old metadata entry
memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size); memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
await_sqe(15); await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
@ -483,30 +349,18 @@ resume_1:
} }
if (has_delete) if (has_delete)
{ {
// zero out new metadata entry
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size); memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
} }
else else
{ {
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size); clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid) assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
new_entry->oid = cur.oid; new_entry->oid = cur.oid;
new_entry->version = cur.version; new_entry->version = cur.version;
if (!bs->inmemory_meta) if (!bs->inmemory_meta)
{ {
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size); memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
} }
// copy latest external bitmap/attributes
if (bs->clean_entry_bitmap_size)
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
} }
await_sqe(6); await_sqe(6);
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size }; data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
@ -556,35 +410,13 @@ resume_1:
} }
// Update clean_db and dirty_db, free old data locations // Update clean_db and dirty_db, free old data locations
update_clean_db(); update_clean_db();
printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
copy_count, has_writes, has_delete, flusher->flush_queue.size());
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second }, false);
// Clear unused part of the journal every <journal_trim_interval> flushes // Clear unused part of the journal every <journal_trim_interval> flushes
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0) if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
{ {
flusher->journal_trim_counter = 0; flusher->journal_trim_counter = 0;
new_trim_pos = bs->journal.get_trim_pos(); if (bs->journal.trim())
if (new_trim_pos != bs->journal.used_start)
{ {
resume_19: // Update journal "superblock"
// Wait for other coroutines trimming the journal, if any
if (flusher->trimming)
wait_state = 19;
return false;
flusher->trimming = true;
// First update journal "superblock" and only then update <used_start> in memory
await_sqe(12); await_sqe(12);
*((journal_entry_start*)flusher->journal_superblock) = { *((journal_entry_start*)flusher->journal_superblock) = {
.crc32 = 0, .crc32 = 0,
@ -592,7 +424,7 @@ resume_1:
.type = JE_START, .type = JE_START,
.size = sizeof(journal_entry_start), .size = sizeof(journal_entry_start),
.reserved = 0, .reserved = 0,
.journal_start = new_trim_pos, .journal_start = bs->journal.used_start,
}; };
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size }; data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
@ -605,28 +437,20 @@ resume_1:
wait_state = 13; wait_state = 13;
return false; return false;
} }
if (!bs->disable_journal_fsync)
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = simple_callback_w;
if (wait_count > 0)
wait_state = 21;
return false;
bs->journal.used_start = new_trim_pos;
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
flusher->trimming = false;
} }
} }
// All done // All done
printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
flusher->active_flushers--; flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
wait_state = 0; wait_state = 0;
goto resume_0; goto resume_0;
} }
@ -645,25 +469,19 @@ bool journal_flusher_co::scan_dirty(int wait_base)
copy_count = 0; copy_count = 0;
clean_loc = UINT64_MAX; clean_loc = UINT64_MAX;
has_delete = false; has_delete = false;
has_writes = false; has_empty = false;
skip_copy = false; skip_copy = false;
clean_init_bitmap = false; clean_init_bitmap = false;
while (1) while (1)
{ {
if (!IS_STABLE(dirty_it->second.state)) if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
char err[1024];
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
throw std::runtime_error(err);
else if (IS_JOURNAL(dirty_it->second.state) && !skip_copy)
{ {
// First we submit all reads // First we submit all reads
has_writes = true; if (dirty_it->second.len == 0)
if (dirty_it->second.len != 0) {
has_empty = true;
{ {
offset = dirty_it->second.offset; offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len; end_offset = dirty_it->second.offset + dirty_it->second.len;
@ -677,18 +495,18 @@ bool journal_flusher_co::scan_dirty(int wait_base)
{ {
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset; submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset; submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign_or_die(MEM_ALIGNMENT, submit_len) }); it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
copy_count++; copy_count++;
if (bs->journal.inmemory) if (bs->journal.inmemory)
{ {
// Take it from memory // Take it from memory
memcpy(it->buf, bs->journal.buffer + submit_offset, submit_len); memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
} }
else else
{ {
// Read it from disk // Read it from disk
await_sqe(0); await_sqe(0);
data->iov = (struct iovec){ it->buf, (size_t)submit_len }; data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
data->callback = simple_callback_r; data->callback = simple_callback_r;
my_uring_prep_readv( my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
@ -702,22 +520,30 @@ bool journal_flusher_co::scan_dirty(int wait_base)
} }
} }
} }
else if (IS_BIG_WRITE(dirty_it->second.state) && !skip_copy) else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{ {
// There is an unflushed big write. Copy small writes in its position // There is an unflushed big write. Copy small writes in its position
has_writes = true;
clean_loc = dirty_it->second.location; clean_loc = dirty_it->second.location;
clean_init_bitmap = true; clean_init_bitmap = true;
clean_bitmap_offset = dirty_it->second.offset; clean_bitmap_offset = dirty_it->second.offset;
clean_bitmap_len = dirty_it->second.len; clean_bitmap_len = dirty_it->second.len;
skip_copy = true; skip_copy = true;
} }
else if (IS_DELETE(dirty_it->second.state) && !skip_copy) else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
{ {
// There is an unflushed delete // There is an unflushed delete
has_delete = true; has_delete = true;
skip_copy = true; skip_copy = true;
} }
else if (!IS_STABLE(dirty_it->second.state))
char err[1024];
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
throw std::runtime_error(err);
dirty_start = dirty_it; dirty_start = dirty_it;
if (dirty_it == bs->dirty_db.begin()) if (dirty_it == bs->dirty_db.begin())
{ {
@ -753,7 +579,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
if ( == flusher->meta_sectors.end()) if ( == flusher->meta_sectors.end())
{ {
// Not in memory yet, read it // Not in memory yet, read it
wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->meta_block_size); wr.buf = memalign(MEM_ALIGNMENT, bs->meta_block_size); = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){ = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
.offset = wr.sector, .offset = wr.sector,
.len = bs->meta_block_size, .len = bs->meta_block_size,
@ -783,7 +609,7 @@ void journal_flusher_co::update_clean_db()
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc) if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{ {
printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order); printf("Free block %lu\n", old_clean_loc >> bs->block_order);
#endif #endif
bs->data_alloc->set(old_clean_loc >> bs->block_order, false); bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
} }
@ -831,34 +657,31 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
sync_found: sync_found:
cur_sync->ready_count++; cur_sync->ready_count++;
flusher->syncing_flushers++; flusher->syncing_flushers++;
resume_1: if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
if (!cur_sync->state)
{ {
if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size()) // Sync batch is ready. Do it.
data->iov = { 0 };
data->callback = simple_callback_w;
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
cur_sync->state = 1;
if (wait_count > 0)
{ {
// Sync batch is ready. Do it.
data->iov = { 0 };
data->callback = simple_callback_w;
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
cur_sync->state = 1;
if (wait_count > 0)
wait_state = 2;
return false;
// Sync completed. All previous coroutines waiting for it must be resumed
cur_sync->state = 2;
// Wait until someone else sends and completes a sync.
wait_state = 1; wait_state = 1;
return false; return false;
} }
// Sync completed. All previous coroutines waiting for it must be resumed
cur_sync->state = 2;
// Wait until someone else sends and completes a sync.
if (!cur_sync->state)
wait_state = 2;
return false;
} }
flusher->syncing_flushers--; flusher->syncing_flushers--;
cur_sync->ready_count--; cur_sync->ready_count--;
@ -869,3 +692,35 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
} }
return true; return true;
} }
void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
if (start == 0)
if (len == 32*bs->bitmap_granularity)
*((uint32_t*)bitmap) = UINT32_MAX;
else if (len == 64*bs->bitmap_granularity)
*((uint64_t*)bitmap) = UINT64_MAX;
unsigned bit_start = start / bs->bitmap_granularity;
unsigned bit_end = ((start + len) + bs->bitmap_granularity - 1) / bs->bitmap_granularity;
while (bit_start < bit_end)
if (!(bit_start & 7) && bit_end >= bit_start+8)
((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
bit_start += 8;
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
struct copy_buffer_t struct copy_buffer_t
{ {
uint64_t offset, len; uint64_t offset, len;
@ -48,7 +45,7 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it; std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w; std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete, has_writes; bool skip_copy, has_delete, has_empty;
blockstore_clean_db_t::iterator clean_it; blockstore_clean_db_t::iterator clean_it;
std::vector<copy_buffer_t> v; std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it; std::vector<copy_buffer_t>::iterator it;
@ -59,8 +56,6 @@ class journal_flusher_co
uint64_t clean_bitmap_offset, clean_bitmap_len; uint64_t clean_bitmap_offset, clean_bitmap_len;
void *new_clean_bitmap; void *new_clean_bitmap;
uint64_t new_trim_pos;
// local: scan_dirty() // local: scan_dirty()
uint64_t offset, end_offset, submit_offset, submit_len; uint64_t offset, end_offset, submit_offset, submit_len;
@ -69,6 +64,7 @@ class journal_flusher_co
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base); bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
void update_clean_db(); void update_clean_db();
bool fsync_batch(bool fsync_meta, int wait_base); bool fsync_batch(bool fsync_meta, int wait_base);
void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
public: public:
journal_flusher_co(); journal_flusher_co();
bool loop(); bool loop();
@ -77,16 +73,14 @@ public:
// Journal flusher itself // Journal flusher itself
class journal_flusher_t class journal_flusher_t
{ {
int trim_wanted = 0;
bool dequeuing; bool dequeuing;
int flusher_count, cur_flusher_count, target_flusher_count; int flusher_count;
int flusher_start_threshold; int sync_threshold;
journal_flusher_co *co; journal_flusher_co *co;
blockstore_impl_t *bs; blockstore_impl_t *bs;
friend class journal_flusher_co; friend class journal_flusher_co;
int journal_trim_counter, journal_trim_interval; int journal_trim_counter, journal_trim_interval;
bool trimming;
void* journal_superblock; void* journal_superblock;
int active_flushers; int active_flushers;
@ -102,10 +96,7 @@ public:
~journal_flusher_t(); ~journal_flusher_t();
void loop(); void loop();
bool is_active(); bool is_active();
void mark_trim_possible(); void force_start();
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid); void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid, bool force); void unshift_flush(obj_ver_id oid);
void remove_flush(object_id oid);
}; };

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop) blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop)
@ -10,9 +7,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); }; ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer); ringloop->register_consumer(&ring_consumer);
initialized = 0; initialized = 0;
zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
data_fd = meta_fd = journal.fd = -1; data_fd = meta_fd = journal.fd = -1;
parse_config(config); parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
try try
{ {
open_data(); open_data();
@ -101,27 +98,45 @@ void blockstore_impl_t::loop()
{ {
// try to submit ops // try to submit ops
unsigned initial_ring_space = ringloop->space_left(); unsigned initial_ring_space = ringloop->space_left();
// has_writes == 0 - no writes before the current queue item // FIXME: rework this "sync polling"
// has_writes == 1 - some writes in progress auto cur_sync = in_progress_syncs.begin();
// has_writes == 2 - tried to submit some writes, but failed while (cur_sync != in_progress_syncs.end())
int has_writes = 0, op_idx = 0, new_idx = 0;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{ {
auto op = submit_queue[op_idx]; if (continue_sync(*cur_sync) != 2)
submit_queue[new_idx] = op; {
// List is unmodified
cur_sync = in_progress_syncs.begin();
auto cur = submit_queue.begin();
int has_writes = 0;
while (cur != submit_queue.end())
auto op_ptr = cur;
auto op = *(cur++);
// FIXME: This needs some simplification // FIXME: This needs some simplification
// Writes should not block reads if the ring is not full and reads don't depend on them // Writes should not block reads if the ring is not full and reads don't depend on them
// In all other cases we should stop submission // In all other cases we should stop submission
if (PRIV(op)->wait_for) if (PRIV(op)->wait_for)
{ {
check_wait(op); check_wait(op);
if (PRIV(op)->wait_for)
printf("still waiting for %d\n", PRIV(op)->wait_for);
if (PRIV(op)->wait_for == WAIT_SQE) if (PRIV(op)->wait_for == WAIT_SQE)
{ {
break; break;
} }
else if (PRIV(op)->wait_for) else if (PRIV(op)->wait_for)
{ {
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
{ {
has_writes = 2; has_writes = 2;
} }
@ -130,33 +145,20 @@ void blockstore_impl_t::loop()
} }
unsigned ring_space = ringloop->space_left(); unsigned ring_space = ringloop->space_left();
unsigned prev_sqe_pos = ringloop->save(); unsigned prev_sqe_pos = ringloop->save();
// 0 = can't submit bool dequeue_op = false;
// 1 = in progress
// 2 = can be removed from queue
int wr_st = 0;
if (op->opcode == BS_OP_READ) if (op->opcode == BS_OP_READ)
{ {
wr_st = dequeue_read(op); dequeue_op = dequeue_read(op);
} }
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
{ {
if (has_writes == 2) if (has_writes == 2)
{ {
// Some writes already could not be submitted // Some writes could not be submitted
continue; break;
} }
wr_st = dequeue_write(op); dequeue_op = dequeue_write(op);
has_writes = wr_st > 0 ? 1 : 2; has_writes = dequeue_op ? 1 : 2;
else if (op->opcode == BS_OP_DELETE)
if (has_writes == 2)
// Some writes already could not be submitted
wr_st = dequeue_del(op);
has_writes = wr_st > 0 ? 1 : 2;
} }
else if (op->opcode == BS_OP_SYNC) else if (op->opcode == BS_OP_SYNC)
{ {
@ -169,31 +171,43 @@ void blockstore_impl_t::loop()
// Can't submit SYNC before previous writes // Can't submit SYNC before previous writes
continue; continue;
} }
wr_st = continue_sync(op, false); dequeue_op = dequeue_sync(op);
if (wr_st != 2)
has_writes = wr_st > 0 ? 1 : 2;
} }
else if (op->opcode == BS_OP_STABLE) else if (op->opcode == BS_OP_STABLE)
{ {
wr_st = dequeue_stable(op); if (has_writes == 2)
// Don't submit additional flushes before completing previous LISTs
dequeue_op = dequeue_stable(op);
} }
else if (op->opcode == BS_OP_ROLLBACK) else if (op->opcode == BS_OP_ROLLBACK)
{ {
wr_st = dequeue_rollback(op); if (has_writes == 2)
// Don't submit additional flushes before completing previous LISTs
dequeue_op = dequeue_rollback(op);
} }
else if (op->opcode == BS_OP_LIST) else if (op->opcode == BS_OP_LIST)
{ {
// LIST doesn't need to be blocked by previous modifications // Block LIST operation by previous modifications,
process_list(op); // so it always returns a consistent state snapshot
wr_st = 2; if (has_writes == 2 || inflight_writes > 0)
has_writes = 2;
dequeue_op = true;
} }
if (wr_st == 2) if (dequeue_op)
{ {
new_idx--; submit_queue.erase(op_ptr);
} }
if (wr_st == 0) else
{ {
ringloop->restore(prev_sqe_pos); ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE) if (PRIV(op)->wait_for == WAIT_SQE)
@ -204,14 +218,6 @@ void blockstore_impl_t::loop()
} }
} }
} }
if (op_idx != new_idx)
while (op_idx < submit_queue.size())
submit_queue[new_idx++] = submit_queue[op_idx++];
if (!readonly) if (!readonly)
{ {
flusher->loop(); flusher->loop();
@ -234,7 +240,7 @@ bool blockstore_impl_t::is_safe_to_stop()
{ {
// It's safe to stop blockstore when there are no in-flight operations, // It's safe to stop blockstore when there are no in-flight operations,
// no in-progress syncs and flusher isn't doing anything // no in-progress syncs and flusher isn't doing anything
if (submit_queue.size() > 0 || !readonly && flusher->is_active()) if (submit_queue.size() > 0 || in_progress_syncs.size() > 0 || !readonly && flusher->is_active())
{ {
return false; return false;
} }
@ -265,9 +271,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (ringloop->space_left() < PRIV(op)->wait_detail) if (ringloop->space_left() < PRIV(op)->wait_detail)
{ {
// stop submission if there's still no free space // stop submission if there's still no free space
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
@ -277,35 +280,25 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (journal.used_start == PRIV(op)->wait_detail) if (journal.used_start == PRIV(op)->wait_detail)
{ {
// do not submit // do not submit
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
} }
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER) else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
{ {
int next = ((journal.cur_sector + 1) % journal.sector_count); int next = ((journal.cur_sector + 1) % journal.sector_count);
if (journal.sector_info[next].flush_count > 0 || if (journal.sector_info[next].usage_count > 0 ||
journal.sector_info[next].dirty) journal.sector_info[next].dirty)
{ {
// do not submit // do not submit
printf("Still waiting for a journal buffer\n");
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
} }
else if (PRIV(op)->wait_for == WAIT_FREE) else if (PRIV(op)->wait_for == WAIT_FREE)
{ {
if (!data_alloc->get_free_count() && flusher->is_active()) if (!data_alloc->get_free_count() && !flusher->is_active())
{ {
printf("Still waiting for free space on the data device\n");
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
@ -316,15 +309,16 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
} }
} }
void blockstore_impl_t::enqueue_op(blockstore_op_t *op) void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
{ {
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX || if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && ( ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE) && (
op->offset >= block_size || op->offset >= block_size ||
op->len > block_size-op->offset || op->len > block_size-op->offset ||
(op->len % disk_alignment) (op->len % disk_alignment)
)) || )) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST) readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
first && op->opcode == BS_OP_WRITE)
{ {
// Basic verification not passed // Basic verification not passed
op->retval = -EINVAL; op->retval = -EINVAL;
@ -369,215 +363,109 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
} }
}; };
} }
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op)) if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
{ {
std::function<void (blockstore_op_t*)>(op->callback)(op); std::function<void (blockstore_op_t*)>(op->callback)(op);
return; return;
} }
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
op->retval = 0;
std::function<void (blockstore_op_t*)>(op->callback)(op);
// Call constructor without allocating memory. We'll call destructor before returning op back // Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t; new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0; PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0; PRIV(op)->pending_ops = 0;
submit_queue.push_back(op); if (!first)
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
while (search_start < search_end)
{ {
int pos = search_start+(search_end-search_start)/2; submit_queue.push_back(op);
if (oid < list[pos].oid)
search_end = pos;
else if (list[pos].oid < oid)
search_start = pos+1;
list[pos].version = version;
return true;
} }
return false; else
} }
void blockstore_impl_t::process_list(blockstore_op_t *op) void blockstore_impl_t::process_list(blockstore_op_t *op)
{ {
// Count objects
uint32_t list_pg = op->offset; uint32_t list_pg = op->offset;
uint32_t pg_count = op->len; uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe; uint64_t pg_stripe_size = op->oid.stripe;
uint64_t min_inode = op->oid.inode;
uint64_t max_inode = op->version;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count)) if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
{ {
op->retval = -EINVAL; op->retval = -EINVAL;
return; return;
} }
// Copy clean_db entries (sorted) uint64_t stable_count = 0;
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1); if (pg_count > 0)
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc); {
if (!stable) for (auto it = clean_db.begin(); it != clean_db.end(); it++)
uint32_t pg = (it->first.inode + it->first.stripe / pg_stripe_size) % pg_count;
if (pg == list_pg)
stable_count = clean_db.size();
uint64_t total_count = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
if (IS_STABLE(it->second.state))
// Allocate memory
op->version = stable_count;
op->retval = total_count;
op->buf = malloc(sizeof(obj_ver_id) * total_count);
if (!op->buf)
{ {
op->retval = -ENOMEM; op->retval = -ENOMEM;
return; return;
} }
obj_ver_id *vers = (obj_ver_id*)op->buf;
int i = 0;
for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{ {
auto clean_it = clean_db.begin(), clean_end = clean_db.end(); if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
{ {
clean_it = clean_db.lower_bound({ vers[i++] = {
.inode = min_inode, .oid = it->first,
.stripe = 0, .version = it->second.version,
}); };
clean_end = clean_db.upper_bound({
.inode = max_inode,
.stripe = UINT64_MAX,
} }
for (; clean_it != clean_end; clean_it++) }
int j = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
{ {
if (!pg_count || ((clean_it->first.inode + clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) if (IS_STABLE(it->second.state))
{ {
if (stable_count >= stable_alloc) vers[i++] = it->first;
{ }
stable_alloc += 32768; else
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); {
if (!stable) vers[j++] = it->first;
op->retval = -ENOMEM;
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
} }
} }
} }
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
dirty_it = dirty_db.lower_bound({
.oid = {
.inode = min_inode,
.stripe = 0,
.version = 0,
dirty_end = dirty_db.upper_bound({
.oid = {
.inode = max_inode,
.stripe = UINT64_MAX,
.version = UINT64_MAX,
for (; dirty_it != dirty_end; dirty_it++)
if (!pg_count || ((dirty_it->first.oid.inode + dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
if (IS_DELETE(dirty_it->second.state))
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
else if (IS_STABLE(dirty_it->second.state))
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
// Then try to replace the last dirty stable version in the second part of the list
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
stable[stable_count-1].version = dirty_it->first.version;
if (stable_count >= stable_alloc)
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
if (unstable)
op->retval = -ENOMEM;
stable[stable_count++] = dirty_it->first;
if (unstable_count >= unstable_alloc)
unstable_alloc += 32768;
unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!unstable)
if (stable)
op->retval = -ENOMEM;
unstable[unstable_count++] = dirty_it->first;
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
if (stable[i].version != 0)
stable[j++] = stable[i];
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
stable_alloc = stable_count+unstable_count;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
if (unstable)
op->retval = -ENOMEM;
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
stable[j++] = unstable[i];
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#pragma once #pragma once
#include "blockstore.h" #include "blockstore.h"
@ -10,6 +7,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <fcntl.h> #include <fcntl.h>
#include <unistd.h> #include <unistd.h>
#include <malloc.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <vector> #include <vector>
@ -19,45 +17,56 @@
#include "cpp-btree/btree_map.h" #include "cpp-btree/btree_map.h"
#include "malloc_or_die.h"
#include "allocator.h" #include "allocator.h"
// States are not stored on disk. Instead, they're deduced from the journal // States are not stored on disk. Instead, they're deduced from the journal
// FIXME: Rename to BS_ST_*
#define BS_ST_SMALL_WRITE 0x01 #define ST_J_WAIT_BIG 1
#define BS_ST_BIG_WRITE 0x02 #define ST_J_IN_FLIGHT 2
#define BS_ST_DELETE 0x03 #define ST_J_SUBMITTED 3
#define ST_J_WRITTEN 4
#define ST_J_SYNCED 5
#define ST_J_STABLE 6
#define BS_ST_WAIT_DEL 0x10 #define ST_D_IN_FLIGHT 15
#define BS_ST_WAIT_BIG 0x20 #define ST_D_SUBMITTED 16
#define BS_ST_IN_FLIGHT 0x30 #define ST_D_WRITTEN 17
#define BS_ST_SUBMITTED 0x40 #define ST_D_SYNCED 20
#define BS_ST_WRITTEN 0x50 #define ST_D_STABLE 21
#define BS_ST_SYNCED 0x60
#define BS_ST_STABLE 0x70
#define BS_ST_INSTANT 0x100 #define ST_DEL_IN_FLIGHT 31
#define ST_DEL_WRITTEN 33
#define ST_DEL_SYNCED 34
#define ST_DEL_STABLE 35
#define ST_CURRENT 48
#define BS_ST_TYPE_MASK 0x0F #define IS_IN_FLIGHT(st) (st == ST_J_WAIT_BIG || st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
#define BS_ST_WORKFLOW_MASK 0xF0 #define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED) #define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE) #define IS_JOURNAL(st) (st >= ST_J_WAIT_BIG && st <= ST_J_STABLE)
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED) #define IS_BIG_WRITE(st) (st >= ST_D_IN_FLIGHT && st <= ST_D_STABLE)
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE) #define IS_DELETE(st) (st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_STABLE)
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE) #define IS_UNSYNCED(st) (st >= ST_J_WAIT_BIG && st <= ST_J_WRITTEN || st >= ST_D_IN_FLIGHT && st <= ST_D_WRITTEN|| st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_WRITTEN)
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define BS_SUBMIT_GET_SQE(sqe, data) \ #define BS_SUBMIT_GET_SQE(sqe, data) \
struct ring_data_t *data = ((ring_data_t*)sqe->user_data) struct ring_data_t *data = ((ring_data_t*)sqe->user_data)
#define BS_SUBMIT_GET_ONLY_SQE(sqe) \ #define BS_SUBMIT_GET_ONLY_SQE(sqe) \
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
struct io_uring_sqe *sqe = get_sqe();\ struct io_uring_sqe *sqe = get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@ -67,6 +76,11 @@
} }
#define BS_SUBMIT_GET_SQE_DECL(sqe) \ #define BS_SUBMIT_GET_SQE_DECL(sqe) \
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
sqe = get_sqe();\ sqe = get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@ -77,8 +91,7 @@
#include "blockstore_journal.h" #include "blockstore_journal.h"
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default) // 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
// per "clean" entry on disk with fixed metadata tables
// FIXME: maybe add crc32's to metadata // FIXME: maybe add crc32's to metadata
struct __attribute__((__packed__)) clean_disk_entry struct __attribute__((__packed__)) clean_disk_entry
{ {
@ -94,7 +107,7 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location; uint64_t location;
}; };
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry) // 56 = 24 + 32 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
struct __attribute__((__packed__)) dirty_entry struct __attribute__((__packed__)) dirty_entry
{ {
uint32_t state; uint32_t state;
@ -103,7 +116,6 @@ struct __attribute__((__packed__)) dirty_entry
uint32_t offset; // data offset within object (stripe) uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry uint64_t journal_sector; // journal sector used for this entry
void* bitmap; // either external bitmap itself when it fits, or a pointer to it when it doesn't
}; };
// - Sync must be submitted after previous writes/deletes (not before!) // - Sync must be submitted after previous writes/deletes (not before!)
@ -156,12 +168,12 @@ struct blockstore_op_private_t
// Write // Write
struct iovec iov_zerofill[3]; struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version;
// Sync // Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes; std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
int sync_small_checked, sync_big_checked; int sync_small_checked, sync_big_checked;
std::list<blockstore_op_t*>::iterator in_progress_ptr;
int prev_sync_count;
}; };
// //
@ -199,10 +211,7 @@ class blockstore_impl_t
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
int immediate_commit = IMMEDIATE_NONE; int immediate_commit = IMMEDIATE_NONE;
bool inmemory_meta = false; bool inmemory_meta = false;
// Maximum flusher count int flusher_count;
unsigned flusher_count;
// Maximum queue depth
unsigned max_write_iodepth = 128;
/******* END OF OPTIONS *******/ /******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer; struct ring_consumer_t ring_consumer;
@ -210,8 +219,9 @@ class blockstore_impl_t
blockstore_clean_db_t clean_db; blockstore_clean_db_t clean_db;
uint8_t *clean_bitmap = NULL; uint8_t *clean_bitmap = NULL;
blockstore_dirty_db_t dirty_db; blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue; std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes; std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
allocator *data_alloc = NULL; allocator *data_alloc = NULL;
uint8_t *zero_object; uint8_t *zero_object;
@ -228,10 +238,10 @@ class blockstore_impl_t
struct journal_t journal; struct journal_t journal;
journal_flusher_t *flusher; journal_flusher_t *flusher;
int write_iodepth = 0;
bool live = false, queue_stall = false; bool live = false, queue_stall = false;
ring_loop_t *ringloop; ring_loop_t *ringloop;
int inflight_writes = 0;
bool stop_sync_submitted; bool stop_sync_submitted;
@ -251,7 +261,6 @@ class blockstore_impl_t
void open_data(); void open_data();
void open_meta(); void open_meta();
void open_journal(); void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
// Asynchronous init // Asynchronous init
int initialized; int initialized;
@ -271,7 +280,6 @@ class blockstore_impl_t
// Write // Write
bool enqueue_write(blockstore_op_t *op); bool enqueue_write(blockstore_op_t *op);
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
int dequeue_write(blockstore_op_t *op); int dequeue_write(blockstore_op_t *op);
int dequeue_del(blockstore_op_t *op); int dequeue_del(blockstore_op_t *op);
int continue_write(blockstore_op_t *op); int continue_write(blockstore_op_t *op);
@ -279,21 +287,21 @@ class blockstore_impl_t
void handle_write_event(ring_data_t *data, blockstore_op_t *op); void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync // Sync
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync); int dequeue_sync(blockstore_op_t *op);
void handle_sync_event(ring_data_t *data, blockstore_op_t *op); void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
void ack_sync(blockstore_op_t *op); int continue_sync(blockstore_op_t *op);
void ack_one_sync(blockstore_op_t *op);
int ack_sync(blockstore_op_t *op);
// Stabilize // Stabilize
int dequeue_stable(blockstore_op_t *op); int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op); int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op); void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver); void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback // Rollback
int dequeue_rollback(blockstore_op_t *op); int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op); int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op); void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc); void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
@ -321,16 +329,13 @@ public:
bool is_stalled(); bool is_stalled();
// Submission // Submission
void enqueue_op(blockstore_op_t *op); void enqueue_op(blockstore_op_t *op, bool first = false);
// Unstable writes are added here (map of object_id -> version) // Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> unstable_writes; std::unordered_map<object_id, uint64_t> unstable_writes;
// Space usage statistics
std::map<uint64_t, uint64_t> inode_space_stats;
inline uint32_t get_block_size() { return block_size; } inline uint32_t get_block_size() { return block_size; }
inline uint64_t get_block_count() { return block_count; } inline uint64_t get_block_count() { return block_count; }
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); } inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
inline uint32_t get_bitmap_granularity() { return disk_alignment; } inline uint32_t get_disk_alignment() { return disk_alignment; }
}; };

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs) blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
@ -100,7 +97,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size); clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size) if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
{ {
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size); memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
} }
if (entry->oid.inode > 0) if (entry->oid.inode > 0)
{ {
@ -111,17 +108,13 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
{ {
// free the previous block // free the previous block
printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i); printf("Free block %lu\n", clean_it->second.location >> bs->block_order);
#endif #endif
bs->data_alloc->set(clean_it->second.location >> block_order, false); bs->data_alloc->set(clean_it->second.location >> block_order, false);
} }
bs->inode_space_stats[entry->oid.inode] += bs->block_size;
entries_loaded++; entries_loaded++;
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version); printf("Allocate block (clean entry) %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif #endif
bs->data_alloc->set(done_cnt+i, true); bs->data_alloc->set(done_cnt+i, true);
bs->clean_db[entry->oid] = (struct clean_entry){ bs->clean_db[entry->oid] = (struct clean_entry){
@ -132,7 +125,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
else else
{ {
printf("Old clean entry %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version); printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif #endif
} }
} }
@ -209,7 +202,11 @@ int blockstore_init_journal::loop()
goto resume_7; goto resume_7;
printf("Reading blockstore journal\n"); printf("Reading blockstore journal\n");
if (!bs->journal.inmemory) if (!bs->journal.inmemory)
submitted_buf = memalign_or_die(MEM_ALIGNMENT, 2*bs->journal.block_size); {
submitted_buf = memalign(MEM_ALIGNMENT, 2*bs->journal.block_size);
if (!submitted_buf)
throw std::bad_alloc();
else else
submitted_buf = bs->journal.buffer; submitted_buf = bs->journal.buffer;
// Read first block of the journal // Read first block of the journal
@ -320,7 +317,7 @@ resume_1:
if (journal_pos < bs->journal.used_start) if (journal_pos < bs->journal.used_start)
end = bs->journal.used_start; end = bs->journal.used_start;
if (!bs->journal.inmemory) if (!bs->journal.inmemory)
submitted_buf = memalign_or_die(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE); submitted_buf = memalign(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE);
else else
submitted_buf = bs->journal.buffer + journal_pos; submitted_buf = bs->journal.buffer + journal_pos;
data->iov = { data->iov = {
@ -403,10 +400,11 @@ resume_1:
} }
} }
} }
bs->flusher->mark_trim_possible(); // Trim journal on start so we don't stall when all entries are older
bs->journal.dirty_start = bs->journal.next_free; bs->journal.dirty_start = bs->journal.next_free;
printf( printf(
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n", "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
entries_loaded, entries_loaded,
(bs->journal.next_free >= bs->journal.used_start (bs->journal.next_free >= bs->journal.used_start
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start) ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@ -456,15 +454,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
break; break;
} }
} }
if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT) if (je->type == JE_SMALL_WRITE)
{ {
printf( printf("je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u\n", je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version, je->small_write.offset, je->small_write.len);
"je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u\n",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
je->small_write.offset, je->small_write.len
#endif #endif
// oid, version, offset, len // oid, version, offset, len
uint64_t prev_free = next_free; uint64_t prev_free = next_free;
@ -482,7 +475,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location != je->small_write.data_offset) if (location != je->small_write.data_offset)
{ {
char err[1024]; char err[1024];
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset); snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
throw std::runtime_error(err); throw std::runtime_error(err);
} }
uint32_t data_crc32 = 0; uint32_t data_crc32 = 0;
@ -534,99 +527,27 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->small_write.oid, .oid = je->small_write.oid,
.version = je->small_write.version, .version = je->small_write.version,
}; };
void *bmp = (void*)je + sizeof(journal_entry_small_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
else if (!bs->journal.inmemory)
// FIXME Using large blockstore objects and not keeping journal in memory
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
bmp = bmp_cp;
bs->dirty_db.emplace(ov, (dirty_entry){ bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED), .state = ST_J_SYNCED,
.flags = 0, .flags = 0,
.location = location, .location = location,
.offset = je->small_write.offset, .offset = je->small_write.offset,
.len = je->small_write.len, .len = je->small_write.len,
.journal_sector = proc_pos, .journal_sector = proc_pos,
.bitmap = bmp,
}); });
bs->journal.used_sectors[proc_pos]++; bs->journal.used_sectors[proc_pos]++;
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
#endif #endif
auto & unstab = bs->unstable_writes[ov.oid]; auto & unstab = bs->unstable_writes[ov.oid];
unstab = unstab < ov.version ? ov.version : unstab; unstab = unstab < ov.version ? ov.version : unstab;
if (je->type == JE_SMALL_WRITE_INSTANT)
} }
} }
else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT) else if (je->type == JE_BIG_WRITE)
{ {
printf( printf("je_big_write oid=%lu:%lu ver=%lu loc=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
"je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
#endif #endif
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
.oid = je->big_write.oid,
.version = UINT64_MAX,
if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
if (dirty_it->first.oid == je->big_write.oid &&
dirty_it->first.version >= je->big_write.version &&
(dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
// It is allowed to overwrite a deleted object with a
// version number smaller than deletion version number,
// because the presence of a BIG_WRITE entry means that
// its data and metadata are already flushed.
// We don't know if newer versions are flushed, but
// the previous delete definitely is.
// So we flush previous dirty entries, but retain the clean one.
// This feature is required for writes happening shortly
// after deletes.
auto dirty_end = dirty_it;
while (1)
if (dirty_it == bs->dirty_db.begin())
if (dirty_it->first.oid != je->big_write.oid)
auto clean_it = bs->clean_db.find(je->big_write.oid);
dirty_it, dirty_end,
clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
// Remove it from the flusher's queue, too
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
auto clean_it = bs->clean_db.find(je->big_write.oid); auto clean_it = bs->clean_db.find(je->big_write.oid);
if (clean_it == bs->clean_db.end() || if (clean_it == bs->clean_db.end() ||
clean_it->second.version < je->big_write.version) clean_it->second.version < je->big_write.version)
@ -636,100 +557,131 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->big_write.oid, .oid = je->big_write.oid,
.version = je->big_write.version, .version = je->big_write.version,
}; };
void *bmp = (void*)je + sizeof(journal_entry_big_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
else if (!bs->journal.inmemory)
// FIXME Using large blockstore objects and not keeping journal in memory
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
bmp = bmp_cp;
bs->dirty_db.emplace(ov, (dirty_entry){ bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED), .state = ST_D_SYNCED,
.flags = 0, .flags = 0,
.location = je->big_write.location, .location = je->big_write.location,
.offset = je->big_write.offset, .offset = je->big_write.offset,
.len = je->big_write.len, .len = je->big_write.len,
.journal_sector = proc_pos, .journal_sector = proc_pos,
.bitmap = bmp,
}); });
printf("Allocate block %lu\n", je->big_write.location >> bs->block_order); printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
#endif #endif
bs->data_alloc->set(je->big_write.location >> bs->block_order, true); bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
bs->journal.used_sectors[proc_pos]++; bs->journal.used_sectors[proc_pos]++;
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
auto & unstab = bs->unstable_writes[ov.oid]; auto & unstab = bs->unstable_writes[ov.oid];
unstab = unstab < ov.version ? ov.version : unstab; unstab = unstab < ov.version ? ov.version : unstab;
if (je->type == JE_BIG_WRITE_INSTANT)
} }
} }
else if (je->type == JE_STABLE) else if (je->type == JE_STABLE)
{ {
printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version); printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
#endif #endif
// oid, version // oid, version
obj_ver_id ov = { obj_ver_id ov = {
.oid = je->stable.oid, .oid = je->stable.oid,
.version = je->stable.version, .version = je->stable.version,
}; };
bs->mark_stable(ov); auto it = bs->dirty_db.find(ov);
if (it == bs->dirty_db.end())
// journal contains a legitimate STABLE entry for a non-existing dirty write
// this probably means that journal was trimmed between WRITE and STABLE entries
// skip it
while (1)
it->second.state = (it->second.state == ST_D_SYNCED
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
if (it == bs->dirty_db.begin())
if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
auto unstab_it = bs->unstable_writes.find(ov.oid);
if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
} }
else if (je->type == JE_ROLLBACK) else if (je->type == JE_ROLLBACK)
{ {
printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version); printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
#endif #endif
// rollback dirty writes of <oid> up to <version> // rollback dirty writes of <oid> up to <version>
obj_ver_id ov = { auto it = bs->dirty_db.lower_bound((obj_ver_id){
.oid = je->rollback.oid, .oid = je->rollback.oid,
.version = je->rollback.version, .version = UINT64_MAX,
}; });
bs->mark_rolled_back(ov); if (it != bs->dirty_db.begin())
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
while (it->first.oid == je->rollback.oid &&
it->first.version > je->rollback.version &&
!IS_IN_FLIGHT(it->second.state) &&
if (it->first.oid != je->rollback.oid)
else if (it->first.version <= je->rollback.version)
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
else if (IS_STABLE(it->second.state))
// Remove entry
rm_start = it;
if (it == bs->dirty_db.begin())
if (rm_start != rm_end)
bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
if (unstab_it != bs->unstable_writes.end())
if (max_unstable == 0)
unstab_it->second = max_unstable;
} }
else if (je->type == JE_DELETE) else if (je->type == JE_DELETE)
{ {
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version); printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
#endif #endif
auto clean_it = bs->clean_db.find(je->del.oid); // oid, version
if (clean_it != bs->clean_db.end() && obj_ver_id ov = {
clean_it->second.version < je->del.version) .oid = je->del.oid,
{ .version = je->del.version,
// oid, version };
obj_ver_id ov = { bs->dirty_db.emplace(ov, (dirty_entry){
.oid = je->del.oid, .state = ST_DEL_SYNCED,
.version = je->del.version, .flags = 0,
}; .location = 0,
bs->dirty_db.emplace(ov, (dirty_entry){ .offset = 0,
.state = (BS_ST_DELETE | BS_ST_SYNCED), .len = 0,
.flags = 0, .journal_sector = proc_pos,
.location = 0, });
.offset = 0, bs->journal.used_sectors[proc_pos]++;
.len = 0,
.journal_sector = proc_pos,
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
} }
started = true; started = true;
pos += je->size; pos += je->size;

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once #pragma once
class blockstore_init_meta class blockstore_init_meta

View File

@ -1,12 +1,9 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs) blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
{ {
this->bs = bs; this->bs = bs;
sectors_to_write = 0; sectors_required = 0;
next_pos = bs->journal.next_free; next_pos = bs->journal.next_free;
next_sector = bs->journal.cur_sector; next_sector = bs->journal.cur_sector;
first_sector = -1; first_sector = -1;
@ -20,26 +17,21 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
int required = entries_required; int required = entries_required;
while (1) while (1)
{ {
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written int fits = (bs->journal.block_size - next_in_pos) / size;
? 0
: (bs->journal.block_size - next_in_pos) / size;
if (fits > 0) if (fits > 0)
{ {
if (fits > required)
fits = required;
if (first_sector == -1) if (first_sector == -1)
{ {
first_sector = next_sector; first_sector = next_sector;
} }
required -= fits; required -= fits;
next_in_pos += fits * size; next_in_pos += fits * size;
sectors_to_write++; sectors_required++;
} }
else if (bs->journal.sector_info[next_sector].dirty) else if (bs->journal.sector_info[next_sector].dirty)
{ {
sectors_to_write++; // sectors_required is more like "sectors to write"
} }
if (required <= 0) if (required <= 0)
{ {
@ -62,7 +54,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes" " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
); );
} }
if (bs->journal.sector_info[next_sector].flush_count > 0 || if (bs->journal.sector_info[next_sector].usage_count > 0 ||
bs->journal.sector_info[next_sector].dirty) bs->journal.sector_info[next_sector].dirty)
{ {
// No memory buffer available. Wait for it. // No memory buffer available. Wait for it.
@ -74,18 +66,17 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
dirty++; dirty++;
used++; used++;
} }
if (bs->journal.sector_info[i].flush_count > 0) if (bs->journal.sector_info[i].usage_count > 0)
{ {
used++; used++;
} }
} }
// In fact, it's even more rare than "ran out of journal space", so print a warning // In fact, it's even more rare than "ran out of journal space", so print a warning
printf( printf(
"Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)" "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
" is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
used, bs->journal.sector_count, dirty, next_sector, used, bs->journal.sector_count, dirty, next_sector,
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty", bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
bs->journal.sector_info[next_sector].flush_count bs->journal.sector_info[next_sector].usage_count
); );
return 0; return 0;
@ -104,11 +95,13 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
{ {
// No space in the journal. Wait until used_start changes. // No space in the journal. Wait until used_start changes.
printf( printf(
"Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n", "Ran out of journal space (free space: %lu bytes)\n",
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start (bs->journal.next_free >= bs->journal.used_start
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
: bs->journal.used_start - bs->journal.next_free)
); );
PRIV(op)->wait_for = WAIT_JOURNAL; PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim(); bs->flusher->force_start();
PRIV(op)->wait_detail = bs->journal.used_start; PRIV(op)->wait_detail = bs->journal.used_start;
return 0; return 0;
} }
@ -117,21 +110,20 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size) journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{ {
if (!journal.entry_fits(size)) if (journal.block_size - journal.in_sector_pos < size)
{ {
assert(!journal.sector_info[journal.cur_sector].dirty); assert(!journal.sector_info[journal.cur_sector].dirty);
// Move to the next journal sector // Move to the next journal sector
if (journal.sector_info[journal.cur_sector].flush_count > 0) if (journal.sector_info[journal.cur_sector].usage_count > 0)
{ {
// Also select next sector buffer in memory // Also select next sector buffer in memory
journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count); journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
assert(!journal.sector_info[journal.cur_sector].flush_count); assert(!journal.sector_info[journal.cur_sector].usage_count);
} }
else else
{ {
journal.dirty_start = journal.next_free; journal.dirty_start = journal.next_free;
} }
journal.sector_info[journal.cur_sector].written = false;
journal.sector_info[journal.cur_sector].offset = journal.next_free; journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0; journal.in_sector_pos = 0;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size; journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
@ -156,8 +148,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb) void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb)
{ {
journal.sector_info[cur_sector].dirty = false; journal.sector_info[cur_sector].dirty = false;
journal.sector_info[cur_sector].written = true; journal.sector_info[cur_sector].usage_count++;
ring_data_t *data = ((ring_data_t*)sqe->user_data); ring_data_t *data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ data->iov = (struct iovec){
(journal.inmemory (journal.inmemory
@ -184,13 +175,13 @@ journal_t::~journal_t()
buffer = NULL; buffer = NULL;
} }
uint64_t journal_t::get_trim_pos() bool journal_t::trim()
{ {
auto journal_used_it = used_sectors.lower_bound(used_start); auto journal_used_it = used_sectors.lower_bound(used_start);
printf( printf(
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n", "Trimming journal (used_start=%08lx, next_free=%08lx, first_used=%08lx, usage_count=%08lx)\n",
used_start, next_free, dirty_start, used_start, next_free,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first, journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
); );
@ -202,19 +193,26 @@ uint64_t journal_t::get_trim_pos()
if (journal_used_it == used_sectors.end()) if (journal_used_it == used_sectors.end())
{ {
// Journal is empty // Journal is empty
return next_free; used_start = next_free;
} }
else else
{ {
// next_free does not need updating during trim used_start = journal_used_it->first;
return journal_used_it->first; // next_free does not need updating here
} }
} }
else if (journal_used_it->first > used_start) else if (journal_used_it->first > used_start)
{ {
// Journal is cleared up to <journal_used_it> // Journal is cleared up to <journal_used_it>
return journal_used_it->first; used_start = journal_used_it->first;
} }
// Can't trim journal else
return used_start; {
// Can't trim journal
return false;
printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
return true;
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once #pragma once
#include "crc32c.h" #include "crc32c.h"
@ -10,8 +7,6 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024 #define JOURNAL_BUFFER_SIZE 4*1024*1024
// We reserve some extra space for future stabilize requests during writes // We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
// Journal entries // Journal entries
@ -24,9 +19,7 @@
#define JE_STABLE 0x04 #define JE_STABLE 0x04
#define JE_DELETE 0x05 #define JE_DELETE 0x05
#define JE_ROLLBACK 0x06 #define JE_ROLLBACK 0x06
#define JE_SMALL_WRITE_INSTANT 0x07 #define JE_MAX 0x06
#define JE_MAX 0x08
// crc32c comes first to ease calculation and is equal to crc32() // crc32c comes first to ease calculation and is equal to crc32()
struct __attribute__((__packed__)) journal_entry_start struct __attribute__((__packed__)) journal_entry_start
@ -54,9 +47,6 @@ struct __attribute__((__packed__)) journal_entry_small_write
// data_offset is its offset within journal // data_offset is its offset within journal
uint64_t data_offset; uint64_t data_offset;
uint32_t crc32_data; uint32_t crc32_data;
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
}; };
struct __attribute__((__packed__)) journal_entry_big_write struct __attribute__((__packed__)) journal_entry_big_write
@ -71,9 +61,6 @@ struct __attribute__((__packed__)) journal_entry_big_write
uint32_t offset; uint32_t offset;
uint32_t len; uint32_t len;
uint64_t location; uint64_t location;
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
}; };
struct __attribute__((__packed__)) journal_entry_stable struct __attribute__((__packed__)) journal_entry_stable
@ -139,8 +126,7 @@ inline uint32_t je_crc32(journal_entry *je)
// Bookkeeping record for one journal sector buffer (one element of journal_t::sector_info).
// NOTE(review): this text is a side-by-side diff rendering, so each line carries the
// left-column and right-column declaration; the counter is named flush_count on one side
// and usage_count on the other, and `written` exists only on the left side.
struct journal_sector_info_t struct journal_sector_info_t
{ {
// Journal position of this sector; assigned from journal.next_free when
// prefill_single_journal_entry() moves on to a new sector
uint64_t offset; uint64_t offset;
// While this counter is non-zero check_available() reports the buffer as unavailable
// ("No memory buffer available. Wait for it."); the right-column code increments it
// in prepare_journal_sector_write()
uint64_t flush_count; uint64_t usage_count;
// Set to true by prepare_journal_sector_write() and reset to false when a new sector is
// selected; consulted together with journal_t::no_same_sector_overwrites in entry_fits()
bool written;
// Cleared by prepare_journal_sector_write(); prefill_single_journal_entry() asserts it is
// false before leaving the current sector
// (presumably set when an entry is added to the sector — the setter is not visible here)
bool dirty; bool dirty;
}; };
@ -165,7 +151,6 @@ struct journal_t
void *sector_buf = NULL; void *sector_buf = NULL;
journal_sector_info_t *sector_info = NULL; journal_sector_info_t *sector_info = NULL;
uint64_t sector_count; uint64_t sector_count;
bool no_same_sector_overwrites = false;
int cur_sector = 0; int cur_sector = 0;
int in_sector_pos = 0; int in_sector_pos = 0;
@ -175,19 +160,13 @@ struct journal_t
~journal_t(); ~journal_t();
bool trim(); bool trim();
uint64_t get_trim_pos();
inline bool entry_fits(int size)
return !(block_size - in_sector_pos < size ||
no_same_sector_overwrites && sector_info[cur_sector].written);
}; };
struct blockstore_journal_check_t struct blockstore_journal_check_t
{ {
blockstore_impl_t *bs; blockstore_impl_t *bs;
uint64_t next_pos, next_sector, next_in_pos; uint64_t next_pos, next_sector, next_in_pos;
int sectors_to_write, first_sector; int sectors_required, first_sector;
bool right_dir; // writing to the end or the beginning of the ring buffer bool right_dir; // writing to the end or the beginning of the ring buffer
blockstore_journal_check_t(blockstore_impl_t *bs); blockstore_journal_check_t(blockstore_impl_t *bs);

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h> #include <sys/file.h>
#include "blockstore_impl.h" #include "blockstore_impl.h"
@ -62,15 +59,12 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
journal_device = config["journal_device"]; journal_device = config["journal_device"];
journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10); journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10); journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false"; journal.inmemory = config["inmemory_journal"] != "false";
disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10); disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10); journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10); meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10); bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10); flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
// Validate // Validate
if (!block_size) if (!block_size)
{ {
@ -84,17 +78,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{ {
flusher_count = 32; flusher_count = 32;
} }
if (!max_write_iodepth)
max_write_iodepth = 128;
if (!disk_alignment) if (!disk_alignment)
{ {
disk_alignment = 4096; disk_alignment = 4096;
} }
else if (disk_alignment % MEM_ALIGNMENT) else if (disk_alignment % MEM_ALIGNMENT)
{ {
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT)); throw std::runtime_error("disk_alingment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
} }
if (!journal_block_size) if (!journal_block_size)
{ {
@ -118,7 +108,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
} }
if (!bitmap_granularity) if (!bitmap_granularity)
{ {
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY; bitmap_granularity = 4096;
} }
else if (bitmap_granularity % disk_alignment) else if (bitmap_granularity % disk_alignment)
{ {
@ -170,7 +160,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
} }
// init some fields // init some fields
clean_entry_bitmap_size = block_size / bitmap_granularity / 8; clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size; clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
journal.block_size = journal_block_size; journal.block_size = journal_block_size;
journal.next_free = journal_block_size; journal.next_free = journal_block_size;
journal.used_start = journal_block_size; journal.used_start = journal_block_size;
@ -237,7 +227,7 @@ void blockstore_impl_t::calc_lengths()
} }
else if (clean_entry_bitmap_size) else if (clean_entry_bitmap_size)
{ {
clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size); clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
if (!clean_bitmap) if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap"); throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len, int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
@ -40,7 +37,6 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
return 1; return 1;
} }
// FIXME I've seen a bug here so I want some tests
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end, int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location) uint32_t item_state, uint64_t item_version, uint64_t item_location)
{ {
@ -53,20 +49,8 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
while (1) while (1)
{ {
for (; it != PRIV(read_op)->read_vec.end(); it++) for (; it != PRIV(read_op)->read_vec.end(); it++)
if (it->offset >= cur_start) if (it->offset >= cur_start)
break; break;
else if (it->offset + it->len > cur_start)
cur_start = it->offset + it->len;
if (cur_start >= item_end)
goto endwhile;
if (it == PRIV(read_op)->read_vec.end() || it->offset > cur_start) if (it == PRIV(read_op)->read_vec.end() || it->offset > cur_start)
{ {
fulfill_read_t el = { fulfill_read_t el = {
@ -85,30 +69,12 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
} }
cur_start = it->offset + it->len; cur_start = it->offset + it->len;
if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end) if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
break; break;
} }
} }
return 1; return 1;
} }
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = block_loc >> block_order;
if (inmemory_meta)
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
return clean_entry_bitmap;
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{ {
auto clean_it = clean_db.find(read_op->oid); auto clean_it = clean_db.find(read_op->oid);
@ -127,7 +93,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
read_op->version = 0; read_op->version = 0;
read_op->retval = read_op->len; read_op->retval = read_op->len;
FINISH_OP(read_op); FINISH_OP(read_op);
return 2; return 1;
} }
uint64_t fulfilled = 0; uint64_t fulfilled = 0;
PRIV(read_op)->pending_ops = 0; PRIV(read_op)->pending_ops = 0;
@ -149,11 +115,6 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version) if (!result_version)
{ {
result_version = dirty_it->first.version; result_version = dirty_it->first.version;
if (read_op->bitmap)
void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
} }
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset))) dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
@ -175,17 +136,12 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version) if (!result_version)
{ {
result_version = clean_it->second.version; result_version = clean_it->second.version;
if (read_op->bitmap)
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
} }
if (fulfilled < read_op->len) if (fulfilled < read_op->len)
{ {
if (!clean_entry_bitmap_size) if (!clean_entry_bitmap_size)
{ {
if (!fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location)) if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{ {
// need to wait. undo added requests, don't dequeue op // need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear(); PRIV(read_op)->read_vec.clear();
@ -194,7 +150,18 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
} }
else else
{ {
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0); uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity; uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size) while (bmp_start < bmp_size)
{ {
@ -205,8 +172,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (bmp_end > bmp_start) if (bmp_end > bmp_start)
{ {
// fill with zeroes // fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0)); bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
} }
bmp_start = bmp_end; bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size) while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@ -216,8 +183,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (bmp_end > bmp_start) if (bmp_end > bmp_start)
{ {
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
clean_it->second.location + bmp_start * bitmap_granularity))
{ {
// need to wait. undo added requests, don't dequeue op // need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear(); PRIV(read_op)->read_vec.clear();
@ -232,7 +198,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else if (fulfilled < read_op->len) else if (fulfilled < read_op->len)
{ {
// fill remaining parts with zeroes // fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0)); fulfill_read(read_op, fulfilled, 0, block_size, ST_DEL_STABLE, 0, 0);
} }
assert(fulfilled == read_op->len); assert(fulfilled == read_op->len);
read_op->version = result_version; read_op->version = result_version;
@ -246,10 +212,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
} }
read_op->retval = read_op->len; read_op->retval = read_op->len;
FINISH_OP(read_op); FINISH_OP(read_op);
return 2; return 1;
} }
read_op->retval = 0; read_op->retval = 0;
return 2; return 1;
} }
void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op) int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
@ -9,14 +6,10 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{ {
return continue_rollback(op); return continue_rollback(op);
} }
obj_ver_id *v, *nv; obj_ver_id* v;
int i, todo = op->len; int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
if (nv != v)
*nv = *v;
// Check that there are some versions greater than v->version (which may be zero), // Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to // check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ auto dirty_it = dirty_db.lower_bound((obj_ver_id){
@ -25,32 +18,31 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
}); });
if (dirty_it == dirty_db.begin()) if (dirty_it == dirty_db.begin())
{ {
skip_ov: if (v->version == 0)
// Already rolled back, skip this object version {
todo--; // Already rolled back
nv--; // FIXME Skip this object version
continue; }
op->retval = -ENOENT;
return 1;
} }
else else
{ {
dirty_it--; dirty_it--;
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version) if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
{ {
goto skip_ov; goto bad_op;
} }
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version) while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
{ {
if (IS_IN_FLIGHT(dirty_it->second.state)) if (!IS_SYNCED(dirty_it->second.state) ||
// Object write is still in progress. Wait until the write request completes
return 0;
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state)) IS_STABLE(dirty_it->second.state))
{ {
op->retval = -EBUSY; op->retval = -EBUSY;
return 2; return 1;
} }
if (dirty_it == dirty_db.begin()) if (dirty_it == dirty_db.begin())
{ {
@ -60,14 +52,6 @@ skip_ov:
} }
} }
} }
op->len = todo;
if (!todo)
// Already rolled back
op->retval = 0;
return 2;
// Check journal space // Check journal space
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0)) if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
@ -75,38 +59,70 @@ skip_ov:
return 0; return 0;
} }
// There is sufficient space. Get SQEs // There is sufficient space. Get SQEs
struct io_uring_sqe *sqe[space_check.sectors_to_write]; struct io_uring_sqe *sqe[space_check.sectors_required];
for (i = 0; i < space_check.sectors_to_write; i++) for (i = 0; i < space_check.sectors_required; i++)
{ {
} }
// Prepare and submit journal entries // Prepare and submit journal entries
auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); }; auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
int s = 0, cur_sector = -1; int s = 0, cur_sector = -1;
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
if (!journal.entry_fits(sizeof(journal_entry_rollback)) && // FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
journal.sector_info[journal.cur_sector].dirty) // FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
uint64_t max_unstable = 0;
while (dirty_it != dirty_db.begin())
{ {
if (cur_sector == -1) dirty_it--;
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector; if (dirty_it->first.oid != v->oid)
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb); break;
cur_sector = journal.cur_sector; else if (dirty_it->first.version <= v->version)
if (!IS_STABLE(dirty_it->second.state))
max_unstable = dirty_it->first.version;
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end())
if (max_unstable == 0)
unstab_it->second = max_unstable;
} }
journal_entry_rollback *je = (journal_entry_rollback*) journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback)); prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
journal.sector_info[journal.cur_sector].dirty = false;
je->oid = v->oid; je->oid = v->oid;
je->version = v->version; je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
if (cur_sector != journal.cur_sector)
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
} }
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1; PRIV(op)->op_state = 1;
return 1; return 1;
} }
@ -126,8 +142,11 @@ resume_2:
resume_3: resume_3:
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
io_uring_sqe *sqe; io_uring_sqe *sqe = get_sqe();
BS_SUBMIT_GET_SQE_DECL(sqe); if (!sqe)
return 0;
ring_data_t *data = ((ring_data_t*)sqe->user_data); ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
@ -142,58 +161,33 @@ resume_5:
int i; int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
mark_rolled_back(*v); // Erase dirty_db entries
auto rm_end = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
auto rm_start = rm_end;
assert(rm_start != dirty_db.begin());
while (1)
if (rm_start->first.oid != v->oid || rm_start->first.version <= v->version)
if (rm_start == dirty_db.begin())
erase_dirty(rm_start, rm_end, UINT64_MAX);
} }
flusher->mark_trim_possible(); journal.trim();
// Acknowledge op // Acknowledge op
op->retval = 0; op->retval = 0;
return 2; return 1;
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
auto it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
if (it != dirty_db.begin())
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
while (1)
if (it->first.oid != ov.oid)
else if (it->first.version <= ov.version)
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
// Remove entry
rm_start = it;
if (it == dirty_db.begin())
if (rm_start != rm_end)
erase_dirty(rm_start, rm_end, UINT64_MAX);
auto unstab_it = unstable_writes.find(ov.oid);
if (unstab_it != unstable_writes.end())
if (max_unstable == 0)
unstab_it->second = max_unstable;
} }
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
@ -201,6 +195,7 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
live = true; live = true;
if (data->res != data->iov.iov_len) if (data->res != data->iov.iov_len)
{ {
throw std::runtime_error( throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+ "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111" "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@ -210,44 +205,19 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
if (PRIV(op)->pending_ops == 0) if (PRIV(op)->pending_ops == 0)
{ {
PRIV(op)->op_state++; PRIV(op)->op_state++;
ringloop->wakeup(); if (!continue_rollback(op))
} }
} }
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc) void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{ {
if (dirty_end == dirty_start)
auto dirty_it = dirty_end; auto dirty_it = dirty_end;
dirty_it--; while (dirty_it != dirty_start)
if (IS_DELETE(dirty_it->second.state))
{ {
object_id oid = dirty_it->first.oid;
printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing
uint32_t next_state = BS_ST_IN_FLIGHT;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
if (IS_BIG_WRITE(dirty_it->second.state))
next_state = BS_ST_WAIT_BIG;
dirty_it = dirty_end;
dirty_it--; dirty_it--;
while (1)
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc) if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
{ {
@ -255,27 +225,15 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif #endif
data_alloc->set(dirty_it->second.location >> block_order, false); data_alloc->set(dirty_it->second.location >> block_order, false);
} }
int used = --journal.used_sectors[dirty_it->second.journal_sector];
printf( printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
#endif #endif
int used = --journal.used_sectors[dirty_it->second.journal_sector];
if (used == 0) if (used == 0)
{ {
journal.used_sectors.erase(dirty_it->second.journal_sector); journal.used_sectors.erase(dirty_it->second.journal_sector);
} }
if (clean_entry_bitmap_size > sizeof(void*))
dirty_it->second.bitmap = NULL;
if (dirty_it == dirty_start)
} }
dirty_db.erase(dirty_start, dirty_end); dirty_db.erase(dirty_start, dirty_end);
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
// Stabilize small write: // Stabilize small write:
@ -60,24 +57,19 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
// No such object version // No such object version
op->retval = -ENOENT; op->retval = -ENOENT;
return 2; return 1;
} }
else else
{ {
// Already stable // Already stable
} }
} }
else if (IS_IN_FLIGHT(dirty_it->second.state)) else if (IS_UNSYNCED(dirty_it->second.state))
// Object write is still in progress. Wait until the write request completes
return 0;
else if (!IS_SYNCED(dirty_it->second.state))
{ {
// Object not synced yet. Caller must sync it first // Object not synced yet. Caller must sync it first
op->retval = -EBUSY; op->retval = -EBUSY;
return 2; return 1;
} }
else if (!IS_STABLE(dirty_it->second.state)) else if (!IS_STABLE(dirty_it->second.state))
{ {
@ -89,7 +81,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
// Already stable // Already stable
op->retval = 0; op->retval = 0;
return 2; return 1;
} }
// Check journal space // Check journal space
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
@ -98,39 +90,50 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
return 0; return 0;
} }
// There is sufficient space. Get SQEs // There is sufficient space. Get SQEs
struct io_uring_sqe *sqe[space_check.sectors_to_write]; struct io_uring_sqe *sqe[space_check.sectors_required];
for (i = 0; i < space_check.sectors_to_write; i++) for (i = 0; i < space_check.sectors_required; i++)
{ {
} }
// Prepare and submit journal entries // Prepare and submit journal entries
auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); }; auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
int s = 0, cur_sector = -1; int s = 0, cur_sector = -1;
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// FIXME: Only stabilize versions that aren't stable yet // FIXME: Only stabilize versions that aren't stable yet
if (!journal.entry_fits(sizeof(journal_entry_stable)) && auto unstab_it = unstable_writes.find(v->oid);
journal.sector_info[journal.cur_sector].dirty) if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v->version)
{ {
if (cur_sector == -1) unstable_writes.erase(unstab_it);
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
cur_sector = journal.cur_sector;
} }
journal_entry_stable *je = (journal_entry_stable*) journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable)); prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
journal.sector_info[journal.cur_sector].dirty = false;
je->oid = v->oid; je->oid = v->oid;
je->version = v->version; je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
if (cur_sector != journal.cur_sector)
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
} }
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1; PRIV(op)->op_state = 1;
return 1; return 1;
} }
@ -150,8 +153,16 @@ resume_2:
resume_3: resume_3:
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
io_uring_sqe *sqe; {
BS_SUBMIT_GET_SQE_DECL(sqe); timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
io_uring_sqe *sqe = get_sqe();
if (!sqe)
return 0;
ring_data_t *data = ((ring_data_t*)sqe->user_data); ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
@ -168,56 +179,48 @@ resume_5:
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// Mark all dirty_db entries up to op->version as stable // Mark all dirty_db entries up to op->version as stable
mark_stable(*v); auto dirty_it = dirty_db.find(*v);
if (dirty_it != dirty_db.end())
while (1)
if (dirty_it->second.state == ST_J_SYNCED)
dirty_it->second.state = ST_J_STABLE;
else if (dirty_it->second.state == ST_D_SYNCED)
dirty_it->second.state = ST_D_STABLE;
else if (dirty_it->second.state == ST_DEL_SYNCED)
dirty_it->second.state = ST_DEL_STABLE;
else if (IS_STABLE(dirty_it->second.state))
if (dirty_it == dirty_db.begin())
if (dirty_it->first.oid != v->oid)
printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
} }
// Acknowledge op // Acknowledge op
op->retval = 0; op->retval = 0;
return 2; return 1;
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
while (1)
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
// Allocations and deletions are counted when they're stabilized
if (IS_BIG_WRITE(dirty_it->second.state))
inode_space_stats[dirty_it->first.oid.inode] += block_size;
else if (IS_DELETE(dirty_it->second.state))
inode_space_stats[dirty_it->first.oid.inode] -= block_size;
else if (IS_STABLE(dirty_it->second.state))
if (dirty_it == dirty_db.begin())
if (dirty_it->first.oid != v.oid)
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
} }
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
@ -225,6 +228,7 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
live = true; live = true;
if (data->res != data->iov.iov_len) if (data->res != data->iov.iov_len)
{ {
throw std::runtime_error( throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+ "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111" "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@ -234,6 +238,9 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
if (PRIV(op)->pending_ops == 0) if (PRIV(op)->pending_ops == 0)
{ {
PRIV(op)->op_state++; PRIV(op)->op_state++;
ringloop->wakeup(); if (!continue_stable(op))
} }
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
#define SYNC_HAS_SMALL 1 #define SYNC_HAS_SMALL 1
@ -12,15 +9,8 @@
#define SYNC_DONE 8 #define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync) int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
{ {
if (immediate_commit == IMMEDIATE_ALL)
// We can return immediately because sync is only dequeued after all previous writes
op->retval = 0;
return 2;
if (PRIV(op)->op_state == 0) if (PRIV(op)->op_state == 0)
{ {
stop_sync_submitted = false; stop_sync_submitted = false;
@ -36,15 +26,34 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
PRIV(op)->op_state = SYNC_HAS_SMALL; PRIV(op)->op_state = SYNC_HAS_SMALL;
else else
PRIV(op)->op_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
PRIV(op)->prev_sync_count = in_progress_syncs.size();
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
} }
// Always dequeue because we always add syncs to in_progress_syncs
return 1;
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
if (PRIV(op)->op_state == SYNC_HAS_SMALL) if (PRIV(op)->op_state == SYNC_HAS_SMALL)
{ {
// No big writes, just fsync the journal // No big writes, just fsync the journal
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
// Wait for small inflight writes to complete
return 0;
if (journal.sector_info[journal.cur_sector].dirty) if (journal.sector_info[journal.cur_sector].dirty)
{ {
// Write out the last journal sector if it happens to be dirty // Write out the last journal sector if it happens to be dirty
prepare_journal_sector_write(journal, journal.cur_sector, sqe, [this, op](ring_data_t *data) { handle_sync_event(data, op); }); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
@ -57,13 +66,21 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
} }
if (PRIV(op)->op_state == SYNC_HAS_BIG) if (PRIV(op)->op_state == SYNC_HAS_BIG)
{ {
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
// Wait for big inflight writes to complete
return 0;
// 1st step: fsync data // 1st step: fsync data
if (!disable_data_fsync) if (!disable_data_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); }; data->callback = cb;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT; PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
@ -76,44 +93,47 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
} }
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE) if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
{ {
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
// Wait for small inflight writes to complete
return 0;
// 2nd step: Data device is synced, prepare & write journal entries // 2nd step: Data device is synced, prepare & write journal entries
// Check space in the journal and journal memory buffers // Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION)) if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), 0))
{ {
return 0; return 0;
} }
// Get SQEs. Don't bother about merging, submit each journal sector as a separate request // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
struct io_uring_sqe *sqe[space_check.sectors_to_write]; struct io_uring_sqe *sqe[space_check.sectors_required];
for (int i = 0; i < space_check.sectors_to_write; i++) for (int i = 0; i < space_check.sectors_required; i++)
{ {
} }
// Prepare and submit journal entries // Prepare and submit journal entries
auto it = PRIV(op)->sync_big_writes.begin(); auto it = PRIV(op)->sync_big_writes.begin();
int s = 0, cur_sector = -1; int s = 0, cur_sector = -1;
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
while (it != PRIV(op)->sync_big_writes.end()) while (it != PRIV(op)->sync_big_writes.end())
{ {
if (!journal.entry_fits(sizeof(journal_entry_big_write)) && journal_entry_big_write *je = (journal_entry_big_write*)
journal.sector_info[journal.cur_sector].dirty) prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
cur_sector = journal.cur_sector;
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, (dirty_db[*it].state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
#endif #endif
je->oid = it->oid; je->oid = it->oid;
je->version = it->version; je->version = it->version;
@ -123,11 +143,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
it++; it++;
if (cur_sector != journal.cur_sector)
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
} }
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
@ -140,7 +163,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); }; data->callback = cb;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
return 1; return 1;
@ -150,10 +173,9 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
PRIV(op)->op_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
} }
} }
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync) if (PRIV(op)->op_state == SYNC_DONE)
{ {
ack_sync(op); return ack_sync(op);
return 2;
} }
return 1; return 1;
} }
@ -185,37 +207,59 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT) else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
{ {
PRIV(op)->op_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
} }
else else
{ {
throw std::runtime_error("BUG: unexpected sync op state"); throw std::runtime_error("BUG: unexpected sync op state");
} }
} }
} }
void blockstore_impl_t::ack_sync(blockstore_op_t *op) int blockstore_impl_t::ack_sync(blockstore_op_t *op)
if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
// Remove dependency of subsequent syncs
auto it = PRIV(op)->in_progress_ptr;
int done_syncs = 1;
// Acknowledge sync
while (it != in_progress_syncs.end())
auto & next_sync = *it++;
PRIV(next_sync)->prev_sync_count -= done_syncs;
if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
// Acknowledge next_sync
return 2;
return 0;
void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
{ {
// Handle states // Handle states
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++) for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
{ {
printf("Ack sync big %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version); printf("Ack sync big %lu:%lu v%lu\n", it->oid.inode, it->oid.stripe, it->version);
#endif #endif
auto & unstab = unstable_writes[it->oid]; auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab; unstab = unstab < it->version ? it->version : unstab;
auto dirty_it = dirty_db.find(*it); auto dirty_it = dirty_db.find(*it);
dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED); dirty_it->second.state = ST_D_SYNCED;
if (dirty_it->second.state & BS_ST_INSTANT)
dirty_it++; dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid) while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{ {
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG) if (dirty_it->second.state == ST_J_WAIT_BIG)
{ {
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT; dirty_it->second.state = ST_J_IN_FLIGHT;
} }
dirty_it++; dirty_it++;
} }
@ -223,25 +267,13 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++) for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
{ {
printf("Ack sync small %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version); printf("Ack sync small %lu:%lu v%lu\n", it->oid.inode, it->oid.stripe, it->version);
#endif #endif
auto & unstab = unstable_writes[it->oid]; auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab; unstab = unstab < it->version ? it->version : unstab;
if (dirty_db[*it].state == (BS_ST_DELETE | BS_ST_WRITTEN)) dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
dirty_db[*it].state = (BS_ST_DELETE | BS_ST_SYNCED);
// Deletions are treated as immediately stable
dirty_db[*it].state = (dirty_db[*it].state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
if (dirty_db[*it].state & BS_ST_INSTANT)
} }
op->retval = 0; op->retval = 0;
} }

View File

@ -1,19 +1,11 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{ {
// Check or assign version number // Check or assign version number
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE); bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
bool wait_big = false, wait_del = false; bool is_inflight_big = false;
void *bmp = NULL;
uint64_t version = 1; uint64_t version = 1;
if (!is_del && clean_entry_bitmap_size > sizeof(void*))
bmp = calloc_or_die(1, clean_entry_bitmap_size);
if (dirty_db.size() > 0) if (dirty_db.size() > 0)
{ {
auto dirty_it = dirty_db.upper_bound((obj_ver_id){ auto dirty_it = dirty_db.upper_bound((obj_ver_id){
@ -26,14 +18,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
found = true; found = true;
version = dirty_it->first.version + 1; version = dirty_it->first.version + 1;
deleted = IS_DELETE(dirty_it->second.state); deleted = IS_DELETE(dirty_it->second.state);
wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL); is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE dirty_it->second.state < ST_D_SYNCED ||
? !IS_SYNCED(dirty_it->second.state) dirty_it->second.state == ST_J_WAIT_BIG;
: ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
if (clean_entry_bitmap_size > sizeof(void*))
memcpy(bmp, dirty_it->second.bitmap, clean_entry_bitmap_size);
bmp = dirty_it->second.bitmap;
} }
} }
if (!found) if (!found)
@ -42,55 +29,29 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (clean_it != clean_db.end()) if (clean_it != clean_db.end())
{ {
version = clean_it->second.version + 1; version = clean_it->second.version + 1;
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy((clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, clean_entry_bitmap_size);
} }
else else
{ {
deleted = true; deleted = true;
} }
} }
if (op->version == 0)
op->version = version;
else if (op->version < version)
// Invalid version requested
op->retval = -EEXIST;
return false;
if (deleted && is_del) if (deleted && is_del)
{ {
// Already deleted // Already deleted
op->retval = 0; op->retval = 0;
return false; return false;
} }
PRIV(op)->real_version = 0; if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
if (op->version == 0)
op->version = version;
else if (op->version < version)
// Implicit operations must be added like that: DEL [FLUSH] BIG [SYNC] SMALL SMALL
if (deleted || wait_del)
// It's allowed to write versions with low numbers over deletes
// However, we have to flush those deletes first as we use version number for ordering
printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
wait_del = true;
PRIV(op)->real_version = op->version;
op->version = version;
.oid = op->oid,
.version = version-1,
}, true);
// Invalid version requested
op->retval = -EEXIST;
if (!is_del && clean_entry_bitmap_size > sizeof(void*))
return false;
if (wait_big && !is_del && !deleted && op->len < block_size &&
immediate_commit != IMMEDIATE_ALL) immediate_commit != IMMEDIATE_ALL)
{ {
// Issue an additional sync so that the previous big write can reach the journal // Issue an additional sync so that the previous big write can reach the journal
@ -104,89 +65,30 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
} }
if (is_del) if (is_del)
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version); printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
else if (!wait_del)
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
// FIXME No strict need to add it into dirty_db here, it's just left
// from the previous implementation where reads waited for writes
uint32_t state;
if (is_del)
else else
{ printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE); #endif
if (wait_del) // No strict need to add it into dirty_db here, it's just left
state |= BS_ST_WAIT_DEL; // from the previous implementation where reads waited for writes
else if (state == BS_ST_SMALL_WRITE && wait_big)
state |= BS_ST_WAIT_BIG;
state |= BS_ST_IN_FLIGHT;
if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT;
if (op->bitmap)
// Only allow to overwrite part of the object bitmap respective to the write's offset/len
uint8_t *bmp_ptr = (uint8_t*)(clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
uint32_t bit = op->offset/bitmap_granularity;
uint32_t bits_left = op->len/bitmap_granularity;
while (!(bit % 8) && bits_left > 8)
// Copy bytes
bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
bit += 8;
bits_left -= 8;
while (bits_left > 0)
// Copy bits
bmp_ptr[bit/8] = (bmp_ptr[bit/8] & ~(1 << (bit%8)))
| (((uint8_t*)op->bitmap)[bit/8] & (1 << bit%8));
dirty_db.emplace((obj_ver_id){ dirty_db.emplace((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}, (dirty_entry){ }, (dirty_entry){
.state = state, .state = (uint32_t)(
: (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
.flags = 0, .flags = 0,
.location = 0, .location = 0,
.offset = is_del ? 0 : op->offset, .offset = is_del ? 0 : op->offset,
.len = is_del ? 0 : op->len, .len = is_del ? 0 : op->len,
.journal_sector = 0, .journal_sector = 0,
.bitmap = bmp,
}); });
return true; return true;
} }
void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval)
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
if (clean_entry_bitmap_size > sizeof(void*))
bool found = false;
for (auto other_op: submit_queue)
if (!found && other_op == op)
found = true;
else if (found && other_op->oid == op->oid &&
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
// Mark operations to cancel them
PRIV(other_op)->real_version = UINT64_MAX;
other_op->retval = retval;
op->retval = retval;
// First step of the write algorithm: dequeue operation and submit initial write(s) // First step of the write algorithm: dequeue operation and submit initial write(s)
int blockstore_impl_t::dequeue_write(blockstore_op_t *op) int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{ {
@ -198,47 +100,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end()); if (dirty_it->second.state == ST_J_WAIT_BIG)
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) < BS_ST_IN_FLIGHT)
// Don't dequeue
return 0;
if (PRIV(op)->real_version != 0)
if (PRIV(op)->real_version == UINT64_MAX)
// This is the flag value used to cancel operations
return 2;
// Restore original low version number for unblocked operations
printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
auto prev_it = dirty_it;
if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
// Original version is still invalid
// All subsequent writes to the same object must be canceled too
cancel_all_writes(op, dirty_it, -EEXIST);
return 2;
op->version = PRIV(op)->real_version;
PRIV(op)->real_version = 0;
dirty_entry e = dirty_it->second;
dirty_it = dirty_db.emplace((obj_ver_id){
.oid = op->oid,
.version = op->version,
}, e).first;
if (write_iodepth >= max_write_iodepth)
{ {
return 0; return 0;
} }
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE) else if (dirty_it->second.state == ST_D_IN_FLIGHT)
{ {
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION)) if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@ -256,13 +122,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
PRIV(op)->wait_for = WAIT_FREE; PRIV(op)->wait_for = WAIT_FREE;
return 0; return 0;
} }
cancel_all_writes(op, dirty_it, -ENOSPC); op->retval = -ENOSPC;
return 2; FINISH_OP(op);
return 1;
} }
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
dirty_it->second.location = loc << block_order; dirty_it->second.location = loc << block_order;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED; dirty_it->second.state = ST_D_SUBMITTED;
printf("Allocate block %lu\n", loc); printf("Allocate block %lu\n", loc);
#endif #endif
@ -302,7 +168,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
PRIV(op)->op_state = 1; PRIV(op)->op_state = 1;
} }
} }
else /* if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_SMALL_WRITE) */ else
{ {
// Small (journaled) write // Small (journaled) write
// First check if the journal has sufficient space // First check if the journal has sufficient space
@ -312,7 +178,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{ {
return 0; return 0;
} }
// There is sufficient space. Get SQE(s) // There is sufficient space. Get SQE(s)
struct io_uring_sqe *sqe1 = NULL; struct io_uring_sqe *sqe1 = NULL;
if (immediate_commit != IMMEDIATE_NONE || if (immediate_commit != IMMEDIATE_NONE ||
@ -343,18 +208,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
} }
} }
// Then pre-fill journal entry // Then pre-fill journal entry
journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry( journal_entry_small_write *je = (journal_entry_small_write*)
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE, prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(journal_entry_small_write));
sizeof(journal_entry_small_write) + clean_entry_bitmap_size
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
#endif #endif
// Figure out where data will be // Figure out where data will be
journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size; journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@ -364,7 +223,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
je->len = op->len; je->len = op->len;
je->data_offset = journal.next_free; je->data_offset = journal.next_free;
je->crc32_data = crc32c(0, op->buf, op->len); je->crc32_data = crc32c(0, op->buf, op->len);
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE) if (immediate_commit != IMMEDIATE_NONE)
@ -394,7 +252,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
// Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data // Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
} }
dirty_it->second.location = journal.next_free; dirty_it->second.location = journal.next_free;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED; dirty_it->second.state = ST_J_SUBMITTED;
journal.next_free += op->len; journal.next_free += op->len;
if (journal.next_free >= journal.len) if (journal.next_free >= journal.len)
{ {
@ -411,13 +269,14 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (!PRIV(op)->pending_ops) if (!PRIV(op)->pending_ops)
{ {
PRIV(op)->op_state = 4; PRIV(op)->op_state = 4;
return continue_write(op); continue_write(op);
} }
else else
{ {
PRIV(op)->op_state = 3; PRIV(op)->op_state = 3;
} }
} }
return 1; return 1;
} }
@ -425,43 +284,40 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
{ {
io_uring_sqe *sqe = NULL; io_uring_sqe *sqe = NULL;
journal_entry_big_write *je; journal_entry_big_write *je;
int op_state = PRIV(op)->op_state;
if (op_state != 2 && op_state != 4)
// In progress
return 1;
auto dirty_it = dirty_db.find((obj_ver_id){ auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end()); if (PRIV(op)->op_state == 2)
if (op_state == 2)
goto resume_2; goto resume_2;
else if (op_state == 4) else if (PRIV(op)->op_state == 4)
goto resume_4; goto resume_4;
return 1;
resume_2: resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry // Only for the immediate_commit mode: prepare and submit big_write journal entry
je = (journal_entry_big_write*)prefill_single_journal_entry( timespec now;
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE, clock_gettime(CLOCK_REALTIME, &now);
sizeof(journal_entry_big_write) + clean_entry_bitmap_size printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
); }
sqe = get_sqe();
if (!sqe)
return 0;
je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
#endif #endif
je->oid = op->oid; je->oid = op->oid;
je->version = op->version; je->version = op->version;
je->offset = op->offset; je->offset = op->offset;
je->len = op->len; je->len = op->len;
je->location = dirty_it->second.location; je->location = dirty_it->second.location;
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal, journal.cur_sector, sqe, prepare_journal_sector_write(journal, journal.cur_sector, sqe,
@ -472,10 +328,15 @@ resume_2:
return 1; return 1;
resume_4: resume_4:
// Switch object state // Switch object state
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state); printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif #endif
bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE bool imm = dirty_it->second.state == ST_D_SUBMITTED
? (immediate_commit == IMMEDIATE_ALL) ? (immediate_commit == IMMEDIATE_ALL)
: (immediate_commit != IMMEDIATE_NONE); : (immediate_commit != IMMEDIATE_NONE);
if (imm) if (imm)
@ -483,30 +344,35 @@ resume_4:
auto & unstab = unstable_writes[op->oid]; auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab; unstab = unstab < op->version ? op->version : unstab;
} }
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) if (dirty_it->second.state == ST_J_SUBMITTED)
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
{ {
// Deletions are treated as immediately stable dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
mark_stable(dirty_it->first); }
else if (dirty_it->second.state == ST_D_SUBMITTED)
dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
else if (dirty_it->second.state == ST_DEL_SUBMITTED)
dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
} }
if (immediate_commit == IMMEDIATE_ALL) if (immediate_commit == IMMEDIATE_ALL)
{ {
dirty_it++; dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid) while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{ {
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG) if (dirty_it->second.state == ST_J_WAIT_BIG)
{ {
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT; dirty_it->second.state = ST_J_IN_FLIGHT;
} }
dirty_it++; dirty_it++;
} }
} }
// Acknowledge write // Acknowledge write
op->retval = op->len; op->retval = op->len;
return 2; return 1;
} }
void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
@ -514,6 +380,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
live = true; live = true;
if (data->res != data->iov.iov_len) if (data->res != data->iov.iov_len)
{ {
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw std::runtime_error( throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+ "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@ -525,7 +392,10 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
{ {
release_journal_sectors(op); release_journal_sectors(op);
PRIV(op)->op_state++; PRIV(op)->op_state++;
ringloop->wakeup(); if (!continue_write(op))
} }
} }
@ -538,15 +408,11 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
uint64_t s = PRIV(op)->min_flushed_journal_sector; uint64_t s = PRIV(op)->min_flushed_journal_sector;
while (1) while (1)
{ {
journal.sector_info[s-1].flush_count--; journal.sector_info[s-1].usage_count--;
if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0) if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
{ {
// We know for sure that we won't write into this sector anymore // We know for sure that we won't write into this sector anymore
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size; uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
if (new_ds >= journal.len)
new_ds = journal.block_size;
if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) < if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
(new_ds + (new_ds >= journal.used_start ? 0 : journal.len))) (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
{ {
@ -563,21 +429,15 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
int blockstore_impl_t::dequeue_del(blockstore_op_t *op) int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
{ {
if (PRIV(op)->op_state)
return continue_write(op);
auto dirty_it = dirty_db.find((obj_ver_id){ auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION)) if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
{ {
return 0; return 0;
} }
io_uring_sqe *sqe = NULL; io_uring_sqe *sqe = NULL;
if (immediate_commit != IMMEDIATE_NONE || if (immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) && (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
@ -602,32 +462,24 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
} }
} }
// Pre-fill journal entry // Pre-fill journal entry
journal_entry_del *je = (journal_entry_del*)prefill_single_journal_entry( journal_entry_del *je = (journal_entry_del*)
journal, JE_DELETE, sizeof(struct journal_entry_del) prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
#endif #endif
je->oid = op->oid; je->oid = op->oid;
je->version = op->version; je->version = op->version;
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
dirty_it->second.state = BS_ST_DELETE | BS_ST_SUBMITTED; dirty_it->second.state = ST_DEL_SUBMITTED;
if (immediate_commit != IMMEDIATE_NONE) if (immediate_commit != IMMEDIATE_NONE)
{ {
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++; PRIV(op)->pending_ops++;
} // Remember small write as unsynced
// Remember delete as unsynced
unsynced_small_writes.push_back((obj_ver_id){ unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
@ -636,7 +488,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
if (!PRIV(op)->pending_ops) if (!PRIV(op)->pending_ops)
{ {
PRIV(op)->op_state = 4; PRIV(op)->op_state = 4;
return continue_write(op); continue_write(op);
} }
else else
{ {

cluster_client.cpp Normal file
View File

@ -0,0 +1,357 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "cluster_client.h"
if (op_data)
if (rmw_buf)
if (buf)
// Note: reusing osd_op_t WILL currently lead to memory leaks
// So we don't reuse it, but free it every time
void cluster_client_t::connect_peer(uint64_t peer_osd, json11::Json address_list, int port)
if (wanted_peers.find(peer_osd) == wanted_peers.end())
wanted_peers[peer_osd] = (osd_wanted_peer_t){
.address_list = address_list,
.port = port,
wanted_peers[peer_osd].address_list = address_list;
wanted_peers[peer_osd].port = port;
wanted_peers[peer_osd].address_changed = true;
if (!wanted_peers[peer_osd].connecting &&
(time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
void cluster_client_t::try_connect_peer(uint64_t peer_osd)
auto wp_it = wanted_peers.find(peer_osd);
if (wp_it == wanted_peers.end())
if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
auto & wp = wp_it->second;
if (wp.address_index >= wp.address_list.array_items().size())
wp.cur_addr = wp.address_list[wp.address_index].string_value();
wp.cur_port = wp.port;
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
// Opens a non-blocking TCP connection to peer_host:peer_port for peer_osd and
// registers the socket in `clients` in state PEER_CONNECTING. Failures are
// reported through on_connect_peer() with a negative errno-style code.
// NOTE(review): braces and some statements are missing from this listing, so
// whether each error path returns and closes peer_fd cannot be confirmed here.
void cluster_client_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
// NOTE(review): addr is not zero-initialised; the visible lines set only
// sin_addr, sin_family and sin_port.
struct sockaddr_in addr;
int r;
// Only a literal IPv4 address is accepted (AF_INET, no name resolution).
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
on_connect_peer(peer_osd, -EINVAL);
addr.sin_family = AF_INET;
// A peer_port of 0 falls back to port 11203.
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
on_connect_peer(peer_osd, -errno);
// Switch the socket to non-blocking mode so connect() returns immediately.
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int timeout_id = -1;
// Optional connect timeout: the timer callback reports -EIO for the peer
// recorded in clients[peer_fd]. The delay passed is 1000*peer_connect_timeout
// (presumably milliseconds — confirm against timerfd_manager).
if (peer_connect_timeout > 0)
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
osd_num_t peer_osd = clients[peer_fd].osd_num;
on_connect_peer(peer_osd, -EIO);
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
// EINPROGRESS is the normal result for a non-blocking connect; anything else
// is reported as a failure.
if (r < 0 && errno != EINPROGRESS)
on_connect_peer(peer_osd, -errno);
assert(peer_osd != this->osd_num);
// Register the connection; in_buf is a receive buffer of receive_buffer_size
// bytes. NOTE(review): the malloc result is not checked in the visible lines.
clients[peer_fd] = (osd_client_t){
.peer_addr = addr,
.peer_port = peer_port,
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTING,
.connect_timeout_id = timeout_id,
.osd_num = peer_osd,
.in_buf = malloc(receive_buffer_size),
// Watch the socket; the handler body is not visible in this listing.
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
// Either OUT (connected) or HUP
// Completes a non-blocking connect on peer_fd: reads the pending socket error,
// reports failure through on_connect_peer(), otherwise marks the client as
// PEER_CONNECTED and installs the regular data handler.
// NOTE(review): braces and some statements are missing from this listing.
void cluster_client_t::handle_connect_epoll(int peer_fd)
// NOTE(review): operator[] creates a default entry if peer_fd is not in
// `clients` — presumably the caller guarantees it exists; confirm.
auto & cl = clients[peer_fd];
// The connect timeout is no longer needed (the call that cancels the timer
// is not visible in this listing).
if (cl.connect_timeout_id >= 0)
cl.connect_timeout_id = -1;
osd_num_t peer_osd = cl.osd_num;
// SO_ERROR holds the outcome of the asynchronous connect (0 = success).
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
result = errno;
if (result != 0)
on_connect_peer(peer_osd, -result);
// Disable Nagle's algorithm on the connected socket; the setsockopt return
// value is ignored.
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl.peer_state = PEER_CONNECTED;
// Replace the connect-time handler with one that forwards events to
// handle_peer_epoll().
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
handle_peer_epoll(peer_fd, epoll_events);
// Check OSD number
// Reacts to epoll events on a connected peer socket: EPOLLRDHUP is treated as
// a disconnect, EPOLLIN as "data available".
// NOTE(review): the statements that actually stop the client or queue it for
// reading are not visible in this listing.
void cluster_client_t::handle_peer_epoll(int peer_fd, int epoll_events)
// Mark client as ready (i.e. some data is available)
if (epoll_events & EPOLLRDHUP)
// Stop client
printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
else if (epoll_events & EPOLLIN)
// Mark client as ready (i.e. some data is available)
auto & cl = clients[peer_fd];
// The body guarded by this read_ready test is not visible here.
if (cl.read_ready == 1)
// Handles the outcome of a connection attempt to peer_osd. A negative peer_fd
// is a negated errno-style code (it is passed to strerror(-peer_fd)); a
// non-negative peer_fd is the connected socket.
// NOTE(review): braces and some statements are missing from this listing, and
// the initializer of `wp` below is garbled (presumably the wanted_peers entry
// for peer_osd — confirm).
void cluster_client_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
auto & wp =;
wp.connecting = false;
if (peer_fd < 0)
printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
// The address list changed while the attempt was in flight: start over from
// the first address.
if (wp.address_changed)
wp.address_changed = false;
wp.address_index = 0;
// NOTE(review): size() is unsigned, so size()-1 wraps around for an empty
// address list and this comparison then holds — confirm the list cannot be
// empty here.
else if (wp.address_index < wp.address_list.array_items().size()-1)
// Try other addresses
// Retry again in <peer_connect_interval> seconds
wp.last_connect_attempt = time(NULL);
wp.address_index = 0;
tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
// Sends a show_conf request to a freshly connected peer and, in the reply
// callback, verifies that the peer reports the expected osd_num before the
// connection is published in osd_peer_fds.
// NOTE(review): braces and some statements are missing from this listing
// (including the rest of the request initializer and the call that queues the
// op), so the exact branch structure cannot be confirmed from here.
void cluster_client_t::check_peer_config(osd_client_t & cl)
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
// The send list refers to the request buffer inside the op itself, so the
// contents assigned to op->req below are what gets sent.
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = cl.peer_fd;
op->req = {
.show_conf = {
.header = {
.id = this->next_subop_id++,
// Reply handler.
op->callback = [this](osd_op_t *op)
osd_client_t & cl = clients[op->peer_fd];
std::string json_err;
json11::Json config;
bool err = false;
// A negative retval means the peer failed the request.
if (op->reply.hdr.retval < 0)
err = true;
printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
// NOTE(review): op->buf is read as a NUL-terminated C string — confirm the
// receive path terminates the reply payload.
config = json11::Json::parse(std::string((char*)op->buf), json_err);
if (json_err != "")
err = true;
printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
// The peer must identify itself with the OSD number we expected.
else if (config["osd_num"].uint64_value() != cl.osd_num)
err = true;
printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
// NOTE(review): on_connect_peer() prints strerror(-peer_fd), so the -1 passed
// here is reported as errno 1 — confirm that message is intended.
on_connect_peer(cl.osd_num, -1);
if (err)
delete op;
// Success path: publish the fd for this OSD and report the connection.
osd_peer_fds[cl.osd_num] = cl.peer_fd;
on_connect_peer(cl.osd_num, cl.peer_fd);
delete op;
// Walks the client's sent_ops and outbox and clears its write_op.
// NOTE(review): the loop bodies and the body of the write_op branch (other
// than the reset to NULL) are not visible in this listing; presumably each op
// is cancelled via cancel_out_op() — confirm.
void cluster_client_t::cancel_osd_ops(osd_client_t & cl)
for (auto p: cl.sent_ops)
for (auto op: cl.outbox)
if (cl.write_op)
cl.write_op = NULL;
// Fills in a synthetic reply for an outbound op: reply magic, the request's
// opcode, and retval -EPIPE.
// NOTE(review): the second assignment below is garbled in this listing
// (presumably it copies the request id into the reply — confirm), and the
// callback invocation that the trailing comment refers to is not visible.
void cluster_client_t::cancel_out_op(osd_op_t *op)
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op-> = op->;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
// Copy lambda to be unaffected by `delete op`
// Tears down the client registered for peer_fd: logs the stop, unregisters the
// fd handler, drops the pending read op and scans the ready lists for the fd.
// NOTE(review): braces and some statements are missing from this listing (e.g.
// the early return for an unknown fd, the erase/close calls and the action
// taken when repeer_osd is set).
void cluster_client_t::stop_client(int peer_fd)
assert(peer_fd != 0);
auto it = clients.find(peer_fd);
// Unknown fd: nothing to stop.
if (it == clients.end())
uint64_t repeer_osd = 0;
// NOTE(review): `cl` is a copy of the map entry, not a reference, so the
// assignments to cl below do not modify clients[peer_fd].
osd_client_t cl = it->second;
if (cl.peer_state == PEER_CONNECTED)
// A non-zero osd_num marks an OSD peer; 0 is logged as a regular client.
if (cl.osd_num)
// Reload configuration from etcd when the connection is dropped
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
repeer_osd = cl.osd_num;
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
// Passing a NULL handler unregisters the fd from the timerfd manager's
// polling (presumably — confirm against timerfd_manager).
tfd->set_fd_handler(peer_fd, false, NULL);
if (cl.osd_num)
// Cancel outbound operations
if (cl.read_op)
delete cl.read_op;
cl.read_op = NULL;
// Look for peer_fd in the ready lists (the removal itself is not visible).
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
if (*rit == peer_fd)
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
if (*wit == peer_fd)
assert(peer_fd != 0);
if (repeer_osd)

cluster_client.h Normal file
View File

@ -0,0 +1,209 @@
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#include <set>
#include <map>
#include <deque>
#include <vector>
#include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
// Growable list of iovec entries. It keeps OSD_OP_INLINE_BUF_COUNT entries
// inline and moves to a malloc'ed array when the inline storage is full.
// `sent` counts entries already consumed from the front.
// NOTE(review): braces and some statements (e.g. the destructor header and
// the initial value given to `alloc`) are missing from this listing.
struct osd_op_buf_list_t
int count = 0, alloc = 0, sent = 0;
iovec *buf = NULL;
iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
// Only a heap-allocated array needs releasing (the release call itself is
// not visible here).
if (buf && buf != inline_buf)
// First entry that has not been sent yet.
inline iovec* get_iovec()
return (buf ? buf : inline_buf) + sent;
// Number of entries still to be sent.
inline int get_size()
return count - sent;
// Appends one buffer, growing the storage when count reaches alloc.
inline void push_back(void *nbuf, size_t len)
if (count >= alloc)
if (!alloc)
buf = inline_buf;
else if (buf == inline_buf)
int old = alloc;
// NOTE(review): ((alloc/16)*16 + 1) maps 16 -> 17 but 17 -> 17, so the
// capacity stops growing once it is not a multiple of 16 and the store into
// buf[count++] below would run past the allocation. Something like
// ((alloc/16) + 1)*16 looks intended — confirm and fix.
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)malloc(sizeof(iovec) * alloc);
memcpy(buf, inline_buf, sizeof(iovec)*old);
// Same growth formula as above for the already-heap-allocated case.
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
buf[count++] = { .iov_base = nbuf, .iov_len = len };
struct blockstore_op_t;
struct osd_primary_op_data_t;
// One OSD operation, either incoming (OSD_OP_IN, the default) or outgoing
// (OSD_OP_OUT), together with its request/reply packets and buffers.
struct osd_op_t
timespec tv_begin;
// OSD_OP_IN or OSD_OP_OUT.
uint64_t op_type = OSD_OP_IN;
// Socket of the client/peer this operation belongs to.
int peer_fd;
osd_any_op_t req;
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
// Data buffers attached to the operation.
void *buf = NULL;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
// Invoked with the op itself; check_peer_config() uses it as the reply handler.
std::function<void(osd_op_t*)> callback;
// Buffers queued for sending for this op.
osd_op_buf_list_t send_list;
// Per-connection state for one socket: peer identity, connect state, and the
// current read and write progress.
struct osd_client_t
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
// PEER_CONNECTING / PEER_CONNECTED (set in try_connect_peer_addr and
// handle_connect_epoll).
int peer_state;
// Timer id of the pending connect timeout, or -1 when none.
int connect_timeout_id = -1;
// OSD number of the peer; 0 is treated as a regular (non-OSD) client.
osd_num_t osd_num = 0;

// Receive buffer allocated in try_connect_peer_addr.
void *in_buf = NULL;

// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
// Presumably CL_READ_HDR / CL_READ_DATA — confirm against the read path.
int read_state = 0;

// Outbound operations sent to this peer
std::map<int, osd_op_t*> sent_ops;

// Outbound messages (replies or requests)
std::deque<osd_op_t*> outbox;

// PGs dirtied by this client's primary-writes (FIXME to drop the connection)
std::set<pg_num_t> dirty_pgs;

// Write state
osd_op_t *write_op = NULL;
msghdr write_msg;
// Presumably CL_WRITE_READY / CL_WRITE_REPLY — confirm against the send path.
int write_state = 0;
// Connection target for one peer OSD: the candidate addresses, the port, and
// the progress of the current series of connection attempts.
// NOTE(review): no member has a default initializer in the visible lines, so
// last_connect_attempt, connecting, address_changed and address_index depend
// on how the entry is created — confirm they are always initialised.
struct osd_wanted_peer_t
// JSON array of address strings.
json11::Json address_list;
int port;
// time(NULL) of the most recent failed series of attempts.
time_t last_connect_attempt;
bool connecting, address_changed;
// Index into address_list of the address currently being tried.
int address_index;
// Address/port of the attempt in flight (used in failure messages).
std::string cur_addr;
int cur_port;
// Zero-initialised per-opcode counters, one slot per opcode index
// 0..OSD_OP_MAX, kept separately for ops and sub-ops.
// NOTE(review): the units of the *_sum arrays are not visible here
// (presumably accumulated latency) — confirm where they are updated.
struct osd_op_stats_t
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
// Networking core shared by OSD code: tracks wanted peers, open client
// connections and the per-connection send/receive queues.
struct cluster_client_t
// Timer / fd event manager and io_uring loop supplied by the owner.
timerfd_manager_t *tfd;
ring_loop_t *ringloop;

// osd_num_t is only for logging and asserts
osd_num_t osd_num;
// Size in bytes of each client's in_buf.
int receive_buffer_size = 9000;
// Both values are compared against time() differences / multiplied by 1000
// for timers, i.e. they are expressed in seconds.
int peer_connect_interval = 5;
int peer_connect_timeout = 5;
int log_level = 0;

// Peers we want to be connected to, keyed by OSD number.
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
// OSD number -> fd of its verified connection (see check_peer_config).
std::map<uint64_t, int> osd_peer_fds;
// Id assigned to the next outgoing request.
uint64_t next_subop_id = 1;

// All open connections, keyed by fd.
std::map<int, osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;

// op statistics
osd_op_stats_t stats;

// public
void connect_peer(uint64_t osd_num, json11::Json address_list, int port);
void stop_client(int peer_fd);
void outbox_push(osd_op_t *cur_op);
// Hooks supplied by the owner.
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;

// private
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_connect_epoll(int peer_fd);
void handle_peer_epoll(int peer_fd, int epoll_events);
// peer_fd < 0 carries a negated errno-style failure code.
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t & cl);
void cancel_osd_ops(osd_client_t & cl);
void cancel_out_op(osd_op_t *op);

bool try_send(osd_client_t & cl);
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);

void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_finished_read(osd_client_t & cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);

View File

@ -1,13 +0,0 @@
# Preprocess src/fio_headers.h and copy every fio/... header mentioned in the
# preprocessor output into fio-copy/.
gcc -I. -E -o fio_headers.i src/fio_headers.h
# Start from an empty output directory.
rm -rf fio-copy
# One iteration per distinct fio/... path found in the preprocessed file.
for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
# NOTE(review): $j is not assigned anywhere in the visible lines (presumably
# derived from $i) and the loop's closing "done" is not shown — confirm
# against the original script.
p=$(dirname $j)
mkdir -p fio-copy/$p
cp $i fio-copy/$j
# Remove the temporary preprocessor output.
rm fio_headers.i

View File

@ -1,18 +0,0 @@
# Preprocess src/qemu_driver.c against a configured QEMU tree and copy every
# qemu/... header mentioned in the preprocessor output into qemu-copy/.
# The commented-out commands below appear to be the manual preparation of the
# QEMU tree — confirm before relying on them.
#cd qemu
#debian/rules b/configure-stamp
#cd b/qemu; make qapi
gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-I qemu/include -E -o qemu_driver.i src/qemu_driver.c
# Start from an empty output directory.
rm -rf qemu-copy
# One iteration per distinct qemu/... path found in the preprocessed file.
for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
# NOTE(review): $j is not assigned anywhere in the visible lines (presumably
# derived from $i) and the loop's closing "done" is not shown — confirm
# against the original script.
p=$(dirname $j)
mkdir -p qemu-copy/$p
cp $i qemu-copy/$j
# Remove the temporary preprocessor output.
rm qemu_driver.i

@ -1 +0,0 @@
Subproject commit 5dc108754ad40d3b1d024f9bd7cca0595ef1a1db

View File

@ -8,10 +8,4 @@
// unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v) // unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
// unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v) // unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
#ifdef __cplusplus
extern "C" {
uint32_t crc32c(uint32_t crc, const void *buf, size_t len); uint32_t crc32c(uint32_t crc, const void *buf, size_t len);
#ifdef __cplusplus

View File

@ -1,7 +0,0 @@
# Build the packages for Debian bullseye in a podman container.
# Instantiate the Dockerfile template: every $REL placeholder becomes
# "bullseye"; the result is written one directory up.
sed 's/$REL/bullseye/g' < vitastor.Dockerfile > ../Dockerfile
# NOTE(review): the commands run unconditionally; a failed cd would make the
# following lines act on the wrong directory.
cd ..
# ./packages is bind-mounted into the build at /root/packages.
mkdir -p packages
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
# Remove the generated Dockerfile.
rm Dockerfile

View File

@ -1,7 +0,0 @@
# Build the packages for Debian buster in a podman container.
# Instantiate the Dockerfile template: every $REL placeholder becomes
# "buster"; the result is written one directory up.
sed 's/$REL/buster/g' < vitastor.Dockerfile > ../Dockerfile
# NOTE(review): the commands run unconditionally; a failed cd would make the
# following lines act on the wrong directory.
cd ..
# ./packages is bind-mounted into the build at /root/packages.
mkdir -p packages
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
# Remove the generated Dockerfile.
rm Dockerfile

debian/changelog vendored
View File

@ -1,17 +0,0 @@
vitastor (0.5.10-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <> Tue, 02 Feb 2021 23:01:24 +0300
vitastor (0.5.1-1) unstable; urgency=medium
* Add jerasure support
-- Vitaliy Filippov <> Sat, 05 Dec 2020 17:02:26 +0300
vitastor (0.5-1) unstable; urgency=medium
* First packaging for Debian
-- Vitaliy Filippov <> Thu, 05 Nov 2020 02:20:59 +0300

debian/compat vendored
View File

@ -1 +0,0 @@

debian/control vendored
View File

@ -1,17 +0,0 @@
Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
Standards-Version: 4.5.0
Rules-Requires-Root: no
Package: vitastor
Architecture: amd64
Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2, lp-solve
Description: Vitastor, a fast software-defined clustered block storage
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives of any
size with configurable redundancy (replication or erasure codes/XOR).

debian/copyright vendored
View File

@ -1,21 +0,0 @@
Upstream-Name: vitastor
Upstream-Contact: Vitaliy Filippov <>
Files: *
Copyright: 2019+ Vitaliy Filippov <>
License: Multiple licenses VNPL-1.1 and/or GPL-2.0+
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
with it ("Proxy Programs"). Proxy Programs may be made public not only under
the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

debian/install vendored
View File

@ -1,3 +0,0 @@
VNPL-1.1.txt usr/share/doc/vitastor
GPL-2.0.txt usr/share/doc/vitastor
mon usr/lib/vitastor

View File

@ -1,44 +0,0 @@
# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
FROM debian:$REL
RUN if [ "$REL" = "buster" ]; then \
echo 'deb buster-backports main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/packages/qemu-$REL/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/packages/qemu-$REL/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/packages/qemu-$REL/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/packages/qemu-$REL/qemu-*/debian/patches; \
P=`ls -d /root/packages/qemu-$REL/qemu-*/debian/patches`; \
echo qemu-5.1-vitastor.patch >> $P/series; \
fi; \
cd /root/packages/qemu-$REL/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
DEBFULLNAME="Vitaliy Filippov <>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

debian/rules vendored
View File

@ -1,9 +0,0 @@
#!/usr/bin/make -f
export DH_VERBOSE = 1
dh $@
cat debian/substvars >> debian/vitastor.substvars

View File

@ -1 +0,0 @@
3.0 (quilt)

debian/substvars vendored
View File

@ -1,2 +0,0 @@

View File

@ -1,67 +0,0 @@
# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
FROM debian:$REL
RUN if [ "$REL" = "buster" ]; then \
echo 'deb buster-backports main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
RUN apt-get -y install libjerasure-dev cmake
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/packages/qemu-$REL/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/packages/qemu-$REL/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi/qapi-builtin-types.h; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.5.10; \
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.10/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5.10/fio; \
cd vitastor-0.5.10; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh; \
sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/packages/qemu-$REL/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.10.orig.tar.xz vitastor-0.5.10; \
cd vitastor-0.5.10; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/vitastor-$REL/vitastor-*/

dump_journal.cpp Normal file
View File

@ -0,0 +1,165 @@
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "blockstore_impl.h"
#include "crc32c.h"
// State of one journal dump run; filled from the command line in main().
struct journal_dump_t
// Path of the journal file/device (argv[1]).
char *journal_device;
// Journal block size in bytes (argv[2]).
uint32_t journal_block;
// Byte offset of the journal inside the device (argv[3]).
uint64_t journal_offset;
// Journal length in bytes (argv[4]).
uint64_t journal_len;
// Current position, relative to journal_offset.
uint64_t journal_pos;
int fd;

// Prints the journal entries contained in one block.
void dump_block(void *buf);
// Entry point: dump_journal <journal_file> <journal_block_size> <offset> <size>.
// Reads the journal block by block and prints, for each block, whether it is
// all zeroes, starts with a journal entry, or looks like random data.
// Returns 1 on usage, validation or open errors and 0 otherwise.
// NOTE(review): braces and some statements are missing from this listing
// (e.g. the break in the zero scan and the dump_block() call).
int main(int argc, char *argv[])
if (argc < 5)
printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
return 1;
journal_dump_t self;
self.journal_device = argv[1];
// Arguments are parsed as decimal; conversion errors are not detected.
self.journal_block = strtoul(argv[2], NULL, 10);
self.journal_offset = strtoull(argv[3], NULL, 10);
self.journal_len = strtoull(argv[4], NULL, 10);
// The block size must be a non-zero multiple of MEM_ALIGNMENT and at most
// 128 KiB.
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
self.journal_block > 128*1024)
printf("Invalid journal block size\n");
return 1;
// O_DIRECT is why the read buffer below is allocated with memalign().
self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
if (self.fd == -1)
printf("Failed to open journal\n");
return 1;
// NOTE(review): the memalign() result is not checked in the visible lines.
void *data = memalign(MEM_ALIGNMENT, self.journal_block);
self.journal_pos = 0;
while (self.journal_pos < self.journal_len)
int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
// NOTE(review): a short or failed read is only caught by assert, i.e. not at
// all when built with NDEBUG.
assert(r == self.journal_block);
// Scan the block in 8-byte words looking for any non-zero data.
// `data+s` is arithmetic on void*, a GNU extension.
uint64_t s;
for (s = 0; s < self.journal_block; s += 8)
if (*((uint64_t*)(data+s)) != 0)
// Scan reached the end: the whole block is zero.
if (s == self.journal_block)
printf("offset %08lx: zeroes\n", self.journal_pos);
self.journal_pos += self.journal_block;
// Block starts with a journal entry header.
else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
printf("offset %08lx:\n", self.journal_pos);
// Otherwise report the first 8 bytes as the observed pattern.
printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
self.journal_pos += self.journal_block;
return 0;
// Prints every journal entry found in one journal block `buf`, advancing
// journal_pos past the block and past the data of any small-write entries.
// NOTE(review): braces and some statements are missing from this listing
// (e.g. the loop exit on a bad entry and the printf( openers of two format
// strings), so the exact control flow cannot be confirmed from here.
void journal_dump_t::dump_block(void *buf)
uint32_t pos = 0;
journal_pos += journal_block;
int entry = 0;
// Set when small-write data is found to wrap to the start of the journal.
bool wrapped = false;
while (pos < journal_block)
// `buf + pos` is arithmetic on void*, a GNU extension.
journal_entry *je = (journal_entry*)(buf + pos);
// An entry without the magic or with an unknown type ends the block.
if (je->magic != JOURNAL_MAGIC || je->type < JE_START || je->type > JE_DELETE)
const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
if (je->type == JE_START)
printf("je_start start=%08lx\n", je->start.journal_start);
else if (je->type == JE_SMALL_WRITE)
"je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u loc=%08lx",
je->small_write.oid.inode, je->small_write.oid.stripe,
je->small_write.version, je->small_write.offset, je->small_write.len,
if (journal_pos + je->small_write.len > journal_len)
// data continues from the beginning of the journal
journal_pos = journal_block;
wrapped = true;
// Compare the position computed while walking with the one stored in the entry.
if (journal_pos != je->small_write.data_offset)
printf(" (mismatched, calculated = %lu)", journal_pos);
journal_pos += je->small_write.len;
if (journal_pos >= journal_len)
journal_pos = journal_block;
wrapped = true;
// Re-read the written data and verify its checksum.
uint32_t data_crc32 = 0;
// NOTE(review): no matching free() for this buffer is visible in this
// listing — confirm it is released on every iteration.
void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
// NOTE(review): the pread() call sits inside assert(), so it is not executed
// at all when built with NDEBUG and the checksum would be computed over
// uninitialised memory.
assert(pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset) == je->small_write.len);
data_crc32 = crc32c(0, data, je->small_write.len);
" data_crc32=%08x%s", je->small_write.crc32_data,
(data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
else if (je->type == JE_BIG_WRITE)
printf("je_big_write oid=%lu:%lu ver=%lu loc=%08lx\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
else if (je->type == JE_STABLE)
printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
else if (je->type == JE_ROLLBACK)
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
else if (je->type == JE_DELETE)
printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
// NOTE(review): je->size comes from the file; a stored size of 0 would keep
// pos unchanged — confirm this cannot loop forever on corrupt input.
pos += je->size;
// After a wrap, setting journal_pos to journal_len ends the outer loop in main().
if (wrapped)
journal_pos = journal_len;

etcd_state_client.cpp Normal file
View File

@ -0,0 +1,374 @@
#include "osd_ops.h"
#include "pg_states.h"
#include "etcd_state_client.h"
#include "http_client.h"
#include "base64.h"
// Decodes one etcd key/value JSON object: both "key" and "value" are
// base64-decoded and the value is parsed as JSON (an empty value becomes a
// null Json). On a JSON parse error the problem is printed and the returned
// key is set to "" so callers can recognise the failure.
json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
json_kv_t kv;
kv.key = base64_decode(kv_json["key"].string_value());
std::string json_err, json_text = base64_decode(kv_json["value"].string_value());
kv.value = json_text == "" ? json11::Json() : json11::Json::parse(json_text, json_err);
if (json_err != "")
printf("Bad JSON in etcd key %s: %s (value: %s)\n", kv.key.c_str(), json_err.c_str(), json_text.c_str());
// Empty key marks the entry as invalid.
kv.key = "";
return kv;
// Runs an etcd transaction: forwards `txn` to the "/kv/txn" API through
// etcd_call() with the same timeout and callback.
void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
etcd_call("/kv/txn", txn, timeout, callback);
// Sends `payload` as a JSON POST to the etcd API path `api` on a randomly
// chosen configured etcd address and passes the outcome to `callback`.
// NOTE(review): the tail of the request string (the blank line and body) is
// not visible in this listing.
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
// NOTE(review): rand() % size() divides by zero when etcd_addresses is
// empty — confirm the list is validated before the first call.
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_api_path;
// Split "host[:port]/path" at the first '/'. find() returns size_t; the code
// relies on npos becoming negative when stored in an int.
int pos = etcd_address.find('/');
if (pos >= 0)
etcd_api_path = etcd_address.substr(pos);
etcd_address = etcd_address.substr(0, pos);
// req first holds the JSON body, so the Content-Length computed on the
// right-hand side below is the body size.
std::string req = payload.dump();
req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
"Host: "+etcd_address+"\r\n"
"Content-Type: application/json\r\n"
"Content-Length: "+std::to_string(req.size())+"\r\n"
"Connection: close\r\n"
http_request_json(tfd, etcd_address, req, timeout, callback);
// Opens a websocket to the etcd "/watch" endpoint of a randomly chosen etcd
// address and registers four watches (config, OSD state, PG state, PG history)
// starting after etcd_watch_revision. Incoming events are fed to parse_state().
// NOTE(review): braces and some statements are missing from this listing
// (e.g. the counter increment for created watches and the reconnect calls).
void etcd_state_client_t::start_etcd_watcher()
// NOTE(review): rand() % size() divides by zero when etcd_addresses is empty.
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_api_path;
// Split "host[:port]/path" at the first '/' (npos becomes negative as int).
int pos = etcd_address.find('/');
if (pos >= 0)
etcd_api_path = etcd_address.substr(pos);
etcd_address = etcd_address.substr(0, pos);
etcd_watches_initialised = 0;
// The lambda is called for every websocket message.
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT, [this](const http_response_t *msg)
if (msg->body.length())
std::string json_err;
json11::Json data = json11::Json::parse(msg->body, json_err);
if (json_err != "")
printf("Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
// "created" acknowledges one of the watch requests posted below.
if (data["result"]["created"].bool_value())
// 4 = number of watches created below; events are processed once all exist.
if (etcd_watches_initialised == 4)
etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
// First gather all changes into a hash to remove multiple overwrites
json11::Json::object changes;
for (auto & ev: data["result"]["events"].array_items())
auto kv = parse_etcd_kv(ev["kv"]);
// parse_etcd_kv() returns an empty key for unparsable values; skip those.
if (kv.key != "")
changes[kv.key] = kv.value;
for (auto & kv: changes)
if (this->log_level > 0)
printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.dump().c_str());
parse_state(kv.first, kv.second);
// React to changes
// End of stream: forget the websocket and arrange a reconnect.
if (msg->eof)
etcd_watch_ws = NULL;
if (etcd_watches_initialised == 0)
// Connection not established, retry in <ETCD_SLOW_TIMEOUT>
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
// Connection was live, retry immediately
// Watch requests. Each range covers the keys from "<prefix>/" up to
// "<prefix>0" ('0' is the character right after '/').
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/") },
{ "range_end", base64_encode(etcd_prefix+"/config0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_CONFIG_WATCH_ID },
} }
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_OSD_STATE_WATCH_ID },
} }
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_STATE_WATCH_ID },
} }
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID },
} }
// Reads "<etcd_prefix>/config/global" from etcd. On error the failure is
// printed and a timer is armed (presumably to retry — the timer body is not
// visible). On success the first returned value, if it is a JSON object,
// becomes the global configuration.
// NOTE(review): braces and some statements are missing from this listing,
// including whatever consumes global_config.
void etcd_state_client_t::load_global_config()
etcd_call("/kv/range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/global") }
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
if (err != "")
printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
// Remember the revision of the first successful read; the watcher starts
// from etcd_watch_revision+1.
if (!etcd_watch_revision)
etcd_watch_revision = data["header"]["revision"].uint64_value();
json11::Json::object global_config;
if (data["kvs"].array_items().size() > 0)
auto kv = parse_etcd_kv(data["kvs"][0]);
// Non-object values leave global_config empty.
if (kv.value.is_object())
global_config = kv.value.object_items();
// Loads PG configuration, PG history, PG state and OSD state from etcd in one
// transaction and feeds every returned key to parse_state(). Optional
// "compare" checks come from load_pgs_checks_hook().
// NOTE(review): braces and some statements are missing from this listing
// (e.g. the bodies of the retry timer and of the "not succeeded" branch).
void etcd_state_client_t::load_pgs()
// Four range reads, executed as the "success" branch of the transaction.
json11::Json::array txn = {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/pgs") },
} }
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
} }
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
} }
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
} }
json11::Json::object req = { { "success", txn } };
// NOTE(review): load_pgs_checks_hook is a std::function and is called without
// a visible emptiness check — confirm it is always set.
json11::Json checks = load_pgs_checks_hook();
if (checks.array_items().size() > 0)
req["compare"] = checks;
etcd_txn(req, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
if (err != "")
printf("Error loading PGs from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
// "succeeded" is false when the compare checks did not hold.
if (!data["succeeded"].bool_value())
for (auto & res: data["responses"].array_items())
for (auto & kv_json: res["response_range"]["kvs"].array_items())
auto kv = parse_etcd_kv(kv_json);
parse_state(kv.key, kv.value);
// Applies one etcd key/value pair to the in-memory state. Recognised keys:
//   <etcd_prefix>/config/pgs      - PG configuration (pause, primary, osd_set)
//   <etcd_prefix>/pg/history/<n>  - historical OSD sets / extra peers of PG n
//   <etcd_prefix>/pg/state/<n>    - current primary and state of PG n
//   <etcd_prefix>/osd/state/<n>   - state of OSD n
// NOTE(review): braces and many statements are missing from this listing
// (loop bodies, continue/return statements, the else branches), so the exact
// control flow cannot be confirmed from here.
void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
if (key == etcd_prefix+"/config/pgs")
// Mark every known PG as absent, then re-mark the ones present in the new value.
for (auto & pg_item: this->pg_config)
pg_item.second.exists = false;
for (auto & pg_item: value["items"].object_items())
pg_num_t pg_num = stoull_full(pg_item.first);
// A parsed PG number of 0 is reported as a bad key.
if (!pg_num)
printf("Bad key in PG configuration: %s (must be a number), skipped\n", pg_item.first.c_str());
this->pg_config[pg_num].exists = true;
this->pg_config[pg_num].pause = pg_item.second["pause"].bool_value();
this->pg_config[pg_num].primary = pg_item.second["primary"].uint64_value();
for (auto pg_osd: pg_item.second["osd_set"].array_items())
// NOTE(review): the expected set size is hard-coded to 3 here.
if (this->pg_config[pg_num].target_set.size() != 3)
printf("Bad PG %u config format: incorrect osd_set = %s\n", pg_num, pg_item.second["osd_set"].dump().c_str());
// A malformed PG is paused.
this->pg_config[pg_num].pause = true;
// 12 == strlen("/pg/history/")
else if (key.substr(0, etcd_prefix.length()+12) == etcd_prefix+"/pg/history/")
// <etcd_prefix>/pg/history/%d
pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+12));
if (!pg_num)
printf("Bad etcd key %s, ignoring\n", key.c_str());
auto & pg_cfg = this->pg_config[pg_num];
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
for (auto hist_item: value["osd_sets"].array_items())
std::vector<osd_num_t> history_set;
for (auto pg_osd: hist_item.array_items())
// Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items())
// 10 == strlen("/pg/state/")
else if (key.substr(0, etcd_prefix.length()+10) == etcd_prefix+"/pg/state/")
// <etcd_prefix>/pg/state/%d
pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+10));
if (!pg_num)
printf("Bad etcd key %s, ignoring\n", key.c_str());
// A null value (presumably a deleted key — confirm) clears the PG's current
// primary and state.
else if (value.is_null())
this->pg_config[pg_num].cur_primary = 0;
this->pg_config[pg_num].cur_state = 0;
osd_num_t cur_primary = value["primary"].uint64_value();
// Translate the array of state keywords into a bit mask using the
// pg_state_names / pg_state_bits tables.
int state = 0;
for (auto & e: value["state"].array_items())
int i;
for (i = 0; i < pg_state_bit_count; i++)
if (e.string_value() == pg_state_names[i])
state = state | pg_state_bits[i];
// i ran off the end of the table: the keyword matched nothing.
if (i >= pg_state_bit_count)
printf("Unexpected PG %u state keyword in etcd: %s\n", pg_num, e.dump().c_str());
// Validation: a primary and a non-empty state array are required, and
// OFFLINE / PEERING / INCOMPLETE may not be combined with any other bit.
if (!cur_primary || !value["state"].is_array() || !state ||
(state & PG_OFFLINE) && state != PG_OFFLINE ||
(state & PG_PEERING) && state != PG_PEERING ||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
printf("Unexpected PG %u state in etcd: primary=%lu, state=%s\n", pg_num, cur_primary, value["state"].dump().c_str());
this->pg_config[pg_num].cur_primary = cur_primary;
this->pg_config[pg_num].cur_state = state;
// 11 == strlen("/osd/state/")
else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
// <etcd_prefix>/osd/state/%d
// NOTE(review): unlike the branches above, this one uses std::stoull, which
// throws on a non-numeric suffix instead of yielding 0 — confirm whether
// stoull_full was intended.
osd_num_t peer_osd = std::stoull(key.substr(etcd_prefix.length()+11));
if (peer_osd > 0)
// Only a well-formed "up" state with an address array and a valid TCP port
// is stored.
if (value.is_object() && value["state"] == "up" &&
value["addresses"].is_array() &&
value["port"].int64_value() > 0 && value["port"].int64_value() < 65536)
this->peer_states[peer_osd] = value;
if (on_change_osd_state_hook != NULL)

View File

@ -1,9 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#pragma once #pragma once
#include "osd_id.h"
#include "http_client.h" #include "http_client.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
@ -16,14 +12,6 @@
#define ETCD_SLOW_TIMEOUT 5000 #define ETCD_SLOW_TIMEOUT 5000
#define DEFAULT_BLOCK_SIZE 128*1024
struct json_kv_t
std::string key;
json11::Json value;
struct pg_config_t struct pg_config_t
{ {
bool exists; bool exists;
@ -34,47 +22,16 @@ struct pg_config_t
bool pause; bool pause;
osd_num_t cur_primary; osd_num_t cur_primary;
int cur_state; int cur_state;
uint64_t epoch;
}; };
struct pool_config_t struct json_kv_t
{ {
bool exists; std::string key;
pool_id_t id; json11::Json value;
std::string name;
uint64_t scheme;
uint64_t pg_size, pg_minsize, parity_chunks;
uint64_t pg_count;
uint64_t real_pg_count;
std::string failure_domain;
uint64_t max_osd_combinations;
uint64_t pg_stripe_size;
std::map<pg_num_t, pg_config_t> pg_config;
struct inode_config_t
uint64_t num;
std::string name;
uint64_t size;
inode_t parent_id;
bool readonly;
struct inode_watch_t
std::string name;
inode_config_t cfg;
}; };
struct etcd_state_client_t struct etcd_state_client_t
{ {
std::vector<inode_watch_t*> watches;
websocket_t *etcd_watch_ws = NULL;
uint64_t bs_block_size = 0;
void add_etcd_url(std::string);
std::vector<std::string> etcd_addresses; std::vector<std::string> etcd_addresses;
std::string etcd_prefix; std::string etcd_prefix;
int log_level = 0; int log_level = 0;
@ -82,17 +39,15 @@ public:
int etcd_watches_initialised = 0; int etcd_watches_initialised = 0;
uint64_t etcd_watch_revision = 0; uint64_t etcd_watch_revision = 0;
std::map<pool_id_t, pool_config_t> pool_config; websocket_t *etcd_watch_ws = NULL;
std::map<pg_num_t, pg_config_t> pg_config;
std::map<osd_num_t, json11::Json> peer_states; std::map<osd_num_t, json11::Json> peer_states;
std::map<inode_t, inode_config_t> inode_config;
std::map<std::string, inode_t> inode_by_name;
std::function<void(json11::Json::object &)> on_change_hook; std::function<void(json11::Json::object &)> on_change_hook;
std::function<void(json11::Json::object &)> on_load_config_hook; std::function<void(json11::Json::object &)> on_load_config_hook;
std::function<json11::Json()> load_pgs_checks_hook; std::function<json11::Json()> load_pgs_checks_hook;
std::function<void(bool)> on_load_pgs_hook; std::function<void(bool)> on_load_pgs_hook;
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook; std::function<void(uint64_t)> on_change_osd_state_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook;
json_kv_t parse_etcd_kv(const json11::Json & kv_json); json_kv_t parse_etcd_kv(const json11::Json & kv_json);
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback); void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
@ -101,8 +56,4 @@ public:
void load_global_config(); void load_global_config();
void load_pgs(); void load_pgs();
void parse_state(const std::string & key, const json11::Json & value); void parse_state(const std::string & key, const json11::Json & value);
void parse_config(json11::Json & config);
inode_watch_t* watch_inode(std::string name);
void close_watch(inode_watch_t* watch);
}; };

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
// FIO engine to test Blockstore // FIO engine to test Blockstore
// //
// Initialize storage for tests: // Initialize storage for tests:
@ -25,7 +22,12 @@
// -bs_config='{"data_device":"./test_data.bin"}' -size=1000M // -bs_config='{"data_device":"./test_data.bin"}' -size=1000M
#include "blockstore.h" #include "blockstore.h"
#include "fio_headers.h" extern "C" {
#include "fio/fio.h"
#include "fio/optgroup.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
@ -288,7 +290,7 @@ static int bs_invalidate(struct thread_data *td, struct fio_file *f)
} }
struct ioengine_ops ioengine = { struct ioengine_ops ioengine = {
.name = "vitastor_blockstore", .name = "microceph_blockstore",
.setup = bs_setup, .setup = bs_setup,

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
// FIO engine to test Blockstore through Secondary OSD interface // FIO engine to test Blockstore through Secondary OSD interface
// //
// Prepare storage like in fio_engine.cpp, then start OSD with ./osd, then test it // Prepare storage like in fio_engine.cpp, then start OSD with ./osd, then test it
@ -8,7 +5,7 @@
// Random write: // Random write:
// //
// fio -thread -ioengine=./ -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \ // fio -thread -ioengine=./ -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -host= -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M // -host= -port=11203 [-single_primary=1] -size=1000M
// //
// Linear write: // Linear write:
// //
@ -30,7 +27,12 @@
#include "rw_blocking.h" #include "rw_blocking.h"
#include "osd_ops.h" #include "osd_ops.h"
#include "fio_headers.h" extern "C" {
#include "fio/fio.h"
#include "fio/optgroup.h"
struct sec_data struct sec_data
{ {
@ -51,7 +53,6 @@ struct sec_options
int port = 0; int port = 0;
int single_primary = 0; int single_primary = 0;
int trace = 0; int trace = 0;
int block_order = 17;
}; };
static struct fio_option options[] = { static struct fio_option options[] = {
@ -73,15 +74,6 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE, .category = FIO_OPT_C_ENGINE,
}, },
.name = "block_size_order",
.lname = "Blockstore block size order",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, block_order),
.help = "Blockstore block size order (size = 2^order)",
.category = FIO_OPT_C_ENGINE,
{ {
.name = "single_primary", .name = "single_primary",
.lname = "Single Primary", .lname = "Single Primary",
@ -140,7 +132,6 @@ static void sec_cleanup(struct thread_data *td)
if (bsd) if (bsd)
{ {
close(bsd->connect_fd); close(bsd->connect_fd);
delete bsd;
} }
} }
@ -149,8 +140,6 @@ static int sec_init(struct thread_data *td)
{ {
sec_options *o = (sec_options*)td->eo; sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data; sec_data *bsd = (sec_data*)td->io_ops_data;
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
struct sockaddr_in addr; struct sockaddr_in addr;
int r; int r;
@ -204,7 +193,7 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
if (!opt->single_primary) if (!opt->single_primary)
{ {
op.hdr.opcode = OSD_OP_SEC_READ; op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid = { op.sec_rw.oid = {
.inode = 1, .inode = 1,
.stripe = io->offset >> bsd->block_order, .stripe = io->offset >> bsd->block_order,
@ -225,7 +214,7 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
if (!opt->single_primary) if (!opt->single_primary)
{ {
op.hdr.opcode = OSD_OP_SEC_WRITE; op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
op.sec_rw.oid = { op.sec_rw.oid = {
.inode = 1, .inode = 1,
.stripe = io->offset >> bsd->block_order, .stripe = io->offset >> bsd->block_order,
@ -313,7 +302,6 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
exit(1); exit(1);
} }
io_u* io = it->second; io_u* io = it->second;
if (io->ddir == DDIR_READ) if (io->ddir == DDIR_READ)
{ {
if (reply.hdr.retval != io->xfer_buflen) if (reply.hdr.retval != io->xfer_buflen)
@ -381,7 +369,7 @@ static int sec_invalidate(struct thread_data *td, struct fio_file *f)
} }
struct ioengine_ops ioengine = { struct ioengine_ops ioengine = {
.name = "vitastor_secondary_osd", .name = "microceph_secondary_osd",
.setup = sec_setup, .setup = sec_setup,

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <sys/epoll.h> #include <sys/epoll.h>
@ -13,8 +10,6 @@
#include <fcntl.h> #include <fcntl.h>
#include <string.h> #include <string.h>
#include <stdexcept>
#include "json11/json11.hpp" #include "json11/json11.hpp"
#include "http_client.h" #include "http_client.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
@ -22,6 +17,7 @@
#define READ_BUFFER_SIZE 9000 #define READ_BUFFER_SIZE 9000
static int extract_port(std::string & host); static int extract_port(std::string & host);
static std::string strtolower(const std::string & in);
static std::string trim(const std::string & in); static std::string trim(const std::string & in);
static std::string ws_format_frame(int type, uint64_t size); static std::string ws_format_frame(int type, uint64_t size);
static bool ws_parse_frame(std::string & buf, int & type, std::string & res); static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
@ -54,15 +50,8 @@ struct http_co_t
websocket_t ws; websocket_t ws;
int onstack = 0;
bool ended = false;
~http_co_t(); ~http_co_t();
inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } }
void start_connection(); void start_connection();
void handle_events();
void handle_connect_result(); void handle_connect_result();
void submit_read(); void submit_read();
void submit_send(); void submit_send();
@ -148,7 +137,7 @@ void websocket_t::post_message(int type, const std::string & msg)
void websocket_t::close() void websocket_t::close()
{ {
co->end(); delete co;
} }
http_co_t::~http_co_t() http_co_t::~http_co_t()
@ -184,15 +173,14 @@ http_co_t::~http_co_t()
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
int port = extract_port(host); int port = extract_port(host);
struct sockaddr_in addr; struct sockaddr_in addr;
int r; int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1) if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{ {
parsed.error_code = ENXIO; parsed.error_code = ENXIO;
stackout(); // FIXME 'delete this' is ugly...
end(); delete this;
return; return;
} }
addr.sin_family = AF_INET; addr.sin_family = AF_INET;
@ -201,8 +189,7 @@ void http_co_t::start_connection()
if (peer_fd < 0) if (peer_fd < 0)
{ {
parsed.error_code = errno; parsed.error_code = errno;
stackout(); delete this;
return; return;
} }
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
@ -214,86 +201,69 @@ void http_co_t::start_connection()
{ {
parsed.error_code = ETIME; parsed.error_code = ETIME;
} }
end(); delete this;
}); });
} }
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
this->epoll_events |= epoll_events;
epoll_events = 0; epoll_events = 0;
// Finally call connect // Finally call connect
r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr)); r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS) if (r < 0 && errno != EINPROGRESS)
{ {
parsed.error_code = errno; parsed.error_code = errno;
stackout(); delete this;
return; return;
} }
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
this->epoll_events |= epoll_events;
void http_co_t::handle_events()
while (epoll_events)
if (state == HTTP_CO_CONNECTING)
epoll_events &= ~EPOLLOUT;
if (epoll_events & EPOLLIN)
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
} }
void http_co_t::handle_connect_result() void http_co_t::handle_connect_result()
{ {
stackin(); if (epoll_events & (EPOLLOUT | EPOLLERR))
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{ {
result = errno; int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
result = errno;
if (result != 0)
parsed.error_code = result;
delete this;
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
this->epoll_events |= epoll_events;
if (this->epoll_events & EPOLLIN)
else if (this->epoll_events & (EPOLLRDHUP|EPOLLERR))
delete this;
} }
if (result != 0) else
{ {
parsed.error_code = result; delete this;
} }
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
this->epoll_events |= epoll_events;
} }
void http_co_t::submit_read() void http_co_t::submit_read()
{ {
int res; int res;
if (rbuf.size() != READ_BUFFER_SIZE) if (rbuf.size() != READ_BUFFER_SIZE)
{ {
rbuf.resize(READ_BUFFER_SIZE); rbuf.resize(READ_BUFFER_SIZE);
@ -301,6 +271,7 @@ void http_co_t::submit_read()
read_iov = { .iov_base =, .iov_len = READ_BUFFER_SIZE }; read_iov = { .iov_base =, .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov; read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1; read_msg.msg_iovlen = 1;
epoll_events = epoll_events & ~EPOLLIN;
res = recvmsg(peer_fd, &read_msg, 0); res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0) if (res < 0)
{ {
@ -308,26 +279,31 @@ void http_co_t::submit_read()
} }
if (res == -EAGAIN) if (res == -EAGAIN)
{ {
epoll_events = epoll_events & ~EPOLLIN; res = 0;
} }
else if (res <= 0) if (res < 0)
{ {
// < 0 means error, 0 means EOF delete this;
if (!res) return;
epoll_events = epoll_events & ~EPOLLIN;
} }
else response += std::string(, res);
if (res == READ_BUFFER_SIZE)
{ {
response += std::string(, res); goto again;
handle_read(); }
if (!handle_read())
if (res < READ_BUFFER_SIZE && (epoll_events & (EPOLLRDHUP|EPOLLERR)))
delete this;
} }
} }
void http_co_t::submit_send() void http_co_t::submit_send()
{ {
int res; int res;
again: again:
if (sent < request.size()) if (sent < request.size())
@ -335,7 +311,7 @@ again:
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent }; send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov; send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1; send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL); res = sendmsg(peer_fd, &send_msg, 0);
if (res < 0) if (res < 0)
{ {
res = -errno; res = -errno;
@ -346,17 +322,14 @@ again:
} }
else if (res < 0) else if (res < 0)
{ {
stackout(); delete this;
return; return;
} }
sent += res; sent += res;
{ {
if (sent >= request.size()) if (sent >= request.size())
else else
goto again; goto again;
} }
@ -367,12 +340,10 @@ again:
goto again; goto again;
} }
} }
} }
bool http_co_t::handle_read() bool http_co_t::handle_read()
{ {
if (state == HTTP_CO_REQUEST_SENT) if (state == HTTP_CO_REQUEST_SENT)
{ {
int pos = response.find("\r\n\r\n"); int pos = response.find("\r\n\r\n");
@ -407,8 +378,7 @@ bool http_co_t::handle_read()
if (!target_response_size) if (!target_response_size)
{ {
// Sorry, unsupported response // Sorry, unsupported response
stackout(); delete this;
return false; return false;
} }
} }
@ -416,8 +386,7 @@ bool http_co_t::handle_read()
} }
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size) if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{ {
stackout(); delete this;
return false; return false;
} }
if (state == HTTP_CO_CHUNKED && response.size() > 0) if (state == HTTP_CO_CHUNKED && response.size() > 0)
@ -445,8 +414,7 @@ bool http_co_t::handle_read()
} }
if (parsed.eof) if (parsed.eof)
{ {
stackout(); delete this;
return false; return false;
} }
if (want_streaming && parsed.body.size() > 0) if (want_streaming && parsed.body.size() > 0)
@ -463,13 +431,11 @@ bool http_co_t::handle_read()
parsed.body = ""; parsed.body = "";
} }
} }
return true; return true;
} }
void http_co_t::post_message(int type, const std::string & msg) void http_co_t::post_message(int type, const std::string & msg)
{ {
if (state == HTTP_CO_WEBSOCKET) if (state == HTTP_CO_WEBSOCKET)
{ {
request += ws_format_frame(type, msg.size()); request += ws_format_frame(type, msg.size());
@ -481,7 +447,6 @@ void http_co_t::post_message(int type, const std::string & msg)
ws_outbox += ws_format_frame(type, msg.size()); ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg; ws_outbox += msg;
} }
} }
uint64_t stoull_full(const std::string & str, int base) uint64_t stoull_full(const std::string & str, int base)
@ -672,7 +637,7 @@ static int extract_port(std::string & host)
return port; return port;
} }
std::string strtolower(const std::string & in) static std::string strtolower(const std::string & in)
{ {
std::string s = in; std::string s = in;
for (int i = 0; i < s.length(); i++) for (int i = 0; i < s.length(); i++)

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#pragma once #pragma once
#include <string> #include <string>
#include <vector> #include <vector>
@ -49,8 +46,6 @@ std::vector<std::string> getifaddr_list(bool include_v6 = false);
uint64_t stoull_full(const std::string & str, int base = 10); uint64_t stoull_full(const std::string & str, int base = 10);
std::string strtolower(const std::string & in);
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request, void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback); const http_options_t & options, std::function<void(const http_response_t *response)> callback);


@ -1 +0,0 @@
Subproject commit 97f06cb20c1e136fd37d58fb40f57dd8f8a3a4a7

lambda_size.cpp Normal file
View File

@ -0,0 +1,48 @@
#include <iostream>
#include <functional>
#include <array>
#include <cstdlib> // for malloc() and free()
using namespace std;
// replace operator new and delete to log allocations
void* operator new(std::size_t n)
cout << "Allocating " << n << " bytes" << endl;
return malloc(n);
void operator delete(void* p) throw()
class test
std::string s;
void a(std::function<void()> & f, const char *str)
auto l = [this, str]() { cout << str << " ? " << s << " from this\n"; };
cout << "Assigning lambda3 of size " << sizeof(l) << endl;
f = l;
int main()
std::array<char, 16> arr1;
auto lambda1 = [arr1](){};
cout << "Assigning lambda1 of size " << sizeof(lambda1) << endl;
std::function<void()> f1 = lambda1;
std::array<char, 17> arr2;
auto lambda2 = [arr2](){};
cout << "Assigning lambda2 of size " << sizeof(lambda2) << endl;
std::function<void()> f2 = lambda2;
test t;
std::function<void()> f3;
t.s = "str";
t.a(f3, "huyambda");

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
// Data distribution optimizer using linear programming (lp_solve) // Data distribution optimizer using linear programming (lp_solve)
const child_process = require('child_process'); const child_process = require('child_process');
@ -28,7 +25,7 @@ async function lp_solve(text)
let vars = {}; let vars = {};
for (const line of stdout.split(/\n/)) for (const line of stdout.split(/\n/))
{ {
let m = /^(^Value of objective function: (-?[\d\.]+)|Actual values of the variables:)\s*$/.exec(line); let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line);
if (m) if (m)
{ {
if (m[2]) if (m[2])
@ -50,34 +47,34 @@ async function lp_solve(text)
return { score, vars }; return { score, vars };
} }
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 }) async function optimize_initial(osd_tree, pg_count, max_combinations)
{ {
if (!pg_count || !osd_tree) max_combinations = max_combinations || 10000;
return null;
const all_weights = Object.assign({}, ...Object.values(osd_tree)); const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0); const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1)); let all_pgs = all_combinations(osd_tree, null, true);
if (all_pgs.length > max_combinations)
const prob = max_combinations/all_pgs.length;
all_pgs = all_pgs.filter(pg => Math.random() < prob);
const pg_per_osd = {}; const pg_per_osd = {};
for (const pg of all_pgs) for (const pg of all_pgs)
{ {
for (let i = 0; i < pg.length; i++) for (const osd of pg)
{ {
const osd = pg[i];
pg_per_osd[osd] = pg_per_osd[osd] || []; pg_per_osd[osd] = pg_per_osd[osd] || [];
pg_per_osd[osd].push((i >= pg_minsize ? parity_space+'*' : '')+"pg_"+pg.join("_")); pg_per_osd[osd].push("pg_"+pg.join("_"));
} }
} }
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length) const pg_size = Math.min(Object.keys(osd_tree).length, 3);
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
let lp = ''; let lp = '';
lp += "max: " => 'pg_'+pg.join('_')).join(' + ')+";\n"; lp += "max: " => 'pg_'+pg.join('_')).join(' + ')+";\n";
for (const osd in pg_per_osd) for (const osd in pg_per_osd)
{ {
if (osd !== NO_OSD) if (osd !== NO_OSD)
{ {
let osd_pg_count = all_weights[osd]/total_weight*pg_effsize*pg_count; let osd_pg_count = all_weights[osd]/total_weight*pg_size*pg_count;
lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n'; lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
} }
} }
@ -89,19 +86,11 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
const lp_result = await lp_solve(lp); const lp_result = await lp_solve(lp);
if (!lp_result) if (!lp_result)
{ {
throw new Error('Problem is infeasible or unbounded - is it a bug?'); throw new Error('Problem is infeasible or unbounded - is it a bug?');
} }
const int_pgs = make_int_pgs(lp_result.vars, pg_count); const int_pgs = make_int_pgs(lp_result.vars, pg_count);
const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space); const eff = pg_list_space_efficiency(int_pgs, all_weights);
const res = { return { score: lp_result.score, weights: lp_result.vars, int_pgs, space: eff*pg_size, total_space: total_weight };
score: lp_result.score,
weights: lp_result.vars,
space: eff * pg_effsize,
total_space: total_weight,
return res;
} }
function make_int_pgs(weights, pg_count) function make_int_pgs(weights, pg_count)
@ -123,117 +112,11 @@ function make_int_pgs(weights, pg_count)
return int_pgs; return int_pgs;
} }
function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
const move_weights = {};
if ((1 << pg_size) < pg_count)
const intersect = {};
for (const pg_name in prev_weights)
const pg = pg_name.substr(3).split(/_/);
for (let omit = 1; omit < (1 << pg_size); omit++)
let pg_omit = [ ];
let intersect_count = pg_size;
for (let i = 0; i < pg_size; i++)
if (omit & (1 << i))
pg_omit[i] = '';
pg_omit = pg_omit.join(':');
intersect[pg_omit] = Math.max(intersect[pg_omit] || 0, intersect_count);
for (const pg of all_pgs)
let max_int = 0;
for (let omit = 1; omit < (1 << pg_size); omit++)
let pg_omit = [ ];
for (let i = 0; i < pg_size; i++)
if (omit & (1 << i))
pg_omit[i] = '';
pg_omit = pg_omit.join(':');
max_int = Math.max(max_int, intersect[pg_omit] || 0);
move_weights['pg_'+pg.join('_')] = pg_size-max_int;
const prev_pg_hashed = Object.keys(prev_weights).map(pg_name => pg_name.substr(3).split(/_/).reduce((a, c) => { a[c] = 1; return a; }, {}));
for (const pg of all_pgs)
if (!prev_weights['pg_'+pg.join('_')])
let max_int = 0;
for (const prev_hash in prev_pg_hashed)
const intersect_count = pg.reduce((a, osd) => a + (prev_hash[osd] ? 1 : 0), 0);
if (max_int < intersect_count)
max_int = intersect_count;
if (max_int >= pg_size)
move_weights['pg_'+pg.join('_')] = pg_size-max_int;
return move_weights;
function add_valid_previous(osd_tree, prev_weights, all_pgs)
// Add previous combinations that are still valid
const hosts = Object.keys(osd_tree).sort();
const host_per_osd = {};
for (const host in osd_tree)
for (const osd in osd_tree[host])
host_per_osd[osd] = host;
skip_pg: for (const pg_name in prev_weights)
const seen_hosts = {};
const pg = pg_name.substr(3).split(/_/);
for (const osd of pg)
if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
continue skip_pg;
seen_hosts[host_per_osd[osd]] = true;
if (!all_pgs[pg_name])
all_pgs[pg_name] = pg;
// Try to minimize data movement // Try to minimize data movement
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 }) async function optimize_change(prev_int_pgs, osd_tree, max_combinations)
{ {
if (!osd_tree) max_combinations = max_combinations || 10000;
{ const pg_size = Math.min(Object.keys(osd_tree).length, 3);
return null;
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length)
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
const pg_count = prev_int_pgs.length; const pg_count = prev_int_pgs.length;
const prev_weights = {}; const prev_weights = {};
const prev_pg_per_osd = {}; const prev_pg_per_osd = {};
@ -241,55 +124,70 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
{ {
const pg_name = 'pg_'+pg.join('_'); const pg_name = 'pg_'+pg.join('_');
prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1; prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1;
for (let i = 0; i < pg.length; i++) for (const osd of pg)
{ {
const osd = pg[i];
prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || []; prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || [];
prev_pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]); prev_pg_per_osd[osd].push(pg_name);
} }
} }
// Get all combinations // Get all combinations
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1); let all_pgs = all_combinations(osd_tree, null, true);
add_valid_previous(osd_tree, prev_weights, all_pgs); if (all_pgs.length > max_combinations)
all_pgs = Object.values(all_pgs); {
const intersecting = all_pgs.filter(pg => prev_weights['pg_'+pg.join('_')]);
if (intersecting.length > max_combinations)
const prob = max_combinations/intersecting.length;
all_pgs = intersecting.filter(pg => Math.random() < prob);
const prob = (max_combinations-intersecting.length)/all_pgs.length;
all_pgs = all_pgs.filter(pg => Math.random() < prob || prev_weights['pg_'+pg.join('_')]);
const pg_per_osd = {}; const pg_per_osd = {};
for (const pg of all_pgs) for (const pg of all_pgs)
{ {
const pg_name = 'pg_'+pg.join('_'); const pg_name = 'pg_'+pg.join('_');
for (let i = 0; i < pg.length; i++) for (const osd of pg)
{ {
const osd = pg[i];
pg_per_osd[osd] = pg_per_osd[osd] || []; pg_per_osd[osd] = pg_per_osd[osd] || [];
pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]); pg_per_osd[osd].push(pg_name);
} }
} }
// Penalize PGs based on their similarity to old PGs // Penalize PGs based on their similarity to old PGs
const move_weights = calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs); const intersect = {};
for (const pg_name in prev_weights)
const pg = pg_name.substr(3).split(/_/);
intersect[pg[0]+'::'] = intersect[':'+pg[1]+':'] = intersect['::'+pg[2]] = 2;
intersect[pg[0]+'::'+pg[2]] = intersect[':'+pg[1]+':'+pg[2]] = intersect[pg[0]+':'+pg[1]+':'] = 1;
const move_weights = {};
for (const pg of all_pgs)
move_weights['pg_'+pg.join('_')] =
intersect[pg[0]+'::'+pg[2]] || intersect[':'+pg[1]+':'+pg[2]] || intersect[pg[0]+':'+pg[1]+':'] ||
intersect[pg[0]+'::'] || intersect[':'+pg[1]+':'] || intersect['::'+pg[2]] ||
// Calculate total weight - old PG weights // Calculate total weight - old PG weights
const all_pg_names = => 'pg_'+pg.join('_')); const all_pg_names = => 'pg_'+pg.join('_'));
const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {});
const all_weights = Object.assign({}, ...Object.values(osd_tree)); const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0); const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
// Generate the LP problem // Generate the LP problem
let lp = ''; let lp = '';
lp += 'max: ' => ( lp += 'max: ' => (
prev_weights[pg_name] ? `${pg_size+1}*add_${pg_name} - ${pg_size+1}*del_${pg_name}` : `${pg_size+1-move_weights[pg_name]}*${pg_name}` prev_weights[pg_name] ? `${4-move_weights[pg_name]}*add_${pg_name} - 4*del_${pg_name}` : `${4-move_weights[pg_name]}*${pg_name}`
)).join(' + ')+';\n'; )).join(' + ')+';\n';
lp += all_pg_names
.map(pg_name => (prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : `${pg_name}`))
.join(' + ')+' = '+(pg_count
- Object.keys(prev_weights).reduce((a, old_pg_name) => (a + (all_pgs_hash[old_pg_name] ? prev_weights[old_pg_name] : 0)), 0)
for (const osd in pg_per_osd) for (const osd in pg_per_osd)
{ {
if (osd !== NO_OSD) if (osd !== NO_OSD)
{ {
const osd_sum = (pg_per_osd[osd]||[]).map(([ pg_name, space ]) => ( const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + ');
prev_weights[pg_name] ? `${space} * add_${pg_name} - ${space} * del_${pg_name}` : `${space} * ${pg_name}` const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => move_weights[old_pg_name]).length;
)).join(' + '); let osd_pg_count = all_weights[osd]*3/total_weight*pg_count - rm_osd_pg_count;
const rm_osd_pg_count = (prev_pg_per_osd[osd]||[])
.reduce((a, [ old_pg_name, space ]) => (a + (all_pgs_hash[old_pg_name] ? space : 0)), 0);
const osd_pg_count = all_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count;
lp += osd_sum + ' <= ' + osd_pg_count + ';\n'; lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
} }
} }
@ -323,7 +221,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
const weights = { ...prev_weights }; const weights = { ...prev_weights };
for (const k in prev_weights) for (const k in prev_weights)
{ {
if (!all_pgs_hash[k]) if (!move_weights[k])
{ {
delete weights[k]; delete weights[k];
} }
@ -338,7 +236,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
{ {
weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]); weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]);
} }
else if (k.substr(0, 3) === 'pg_') else
{ {
weights[k] = Number(lp_result.vars[k]); weights[k] = Number(lp_result.vars[k]);
} }
@ -360,7 +258,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
{ {
differs++; differs++;
} }
for (let j = 0; j < pg_size; j++) for (let j = 0; j < 3; j++)
{ {
if (new_pgs[i][j] != prev_int_pgs[i][j]) if (new_pgs[i][j] != prev_int_pgs[i][j])
{ {
@ -375,7 +273,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
int_pgs: new_pgs, int_pgs: new_pgs,
differs, differs,
osd_differs, osd_differs,
space: pg_effsize * pg_list_space_efficiency(new_pgs, all_weights, pg_minsize, parity_space), space: pg_size * pg_list_space_efficiency(new_pgs, all_weights),
total_space: total_weight, total_space: total_weight,
}; };
} }
@ -493,155 +391,64 @@ function extract_osds(osd_tree, levels, osd_level, osds = {})
return osds; return osds;
} }
// ordered = don't treat (x,y) and (y,x) as equal // FIXME: support different pg_sizes, not just 3
function random_combinations(osd_tree, pg_size, count, ordered)
let seed = 0x5f020e43;
let rng = () =>
seed ^= seed << 13;
seed ^= seed >> 17;
seed ^= seed << 5;
return seed + 2147483648;
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
const r = {};
// Generate random combinations including each OSD at least once
for (let h = 0; h < hosts.length; h++)
for (let o = 0; o < osds[hosts[h]].length; o++)
const pg = [ osds[hosts[h]][o] ];
const cur_hosts = [ ...hosts ];
cur_hosts.splice(h, 1);
for (let i = 1; i < pg_size && i < hosts.length; i++)
const next_host = rng() % cur_hosts.length;
const next_osd = rng() % osds[cur_hosts[next_host]].length;
cur_hosts.splice(next_host, 1);
const cyclic_pgs = [ pg ];
if (ordered)
for (let i = 1; i < pg.size; i++)
cyclic_pgs.push([,, i) ]);
for (const pg of cyclic_pgs)
while (pg.length < pg_size)
r['pg_'+pg.join('_')] = pg;
// Generate purely random combinations
while (count > 0)
let host_idx = [];
const cur_hosts = [, i) => i) ];
const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
if (ordered)
for (let i = 0; i < max_hosts; i++)
const r = rng() % cur_hosts.length;
host_idx[i] = cur_hosts[r];
cur_hosts.splice(r, 1);
for (let i = 0; i < max_hosts; i++)
const r = rng() % (cur_hosts.length - (max_hosts - i - 1));
host_idx[i] = cur_hosts[r];
cur_hosts.splice(0, r+1);
let pg = => osds[hosts[h]][rng() % osds[hosts[h]].length]);
while (pg.length < pg_size)
r['pg_'+pg.join('_')] = pg;
return r;
// Super-stupid algorithm. Given the current OSD tree, generate all possible OSD combinations
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... } // osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
// ordered = return combinations without duplicates having different order function all_combinations(osd_tree, count, ordered)
function all_combinations(osd_tree, pg_size, ordered, count)
{ {
const hosts = Object.keys(osd_tree).sort(); const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {}); const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
while (hosts.length < pg_size) while (hosts.length < 3)
{ {
osds[NO_OSD] = [ NO_OSD ]; osds[NO_OSD] = [ NO_OSD ];
hosts.push(NO_OSD); hosts.push(NO_OSD);
} }
let host_idx = []; let host_idx = [ 0, 1, 2 ];
let osd_idx = []; let osd_idx = [ 0, 0, 0 ];
for (let i = 0; i < pg_size; i++)
const r = []; const r = [];
while (!count || count < 0 || r.length < count) while (!count || count < 0 || r.length < count)
{ {
r.push(, i) => osds[hosts[hi]][osd_idx[i]])); let inc;
let inc = pg_size-1; if (host_idx[2] != host_idx[1] && host_idx[2] != host_idx[0] && host_idx[1] != host_idx[0])
while (inc >= 0)
{ {
osd_idx[inc]++; r.push(, i) => osds[hosts[hi]][osd_idx[i]]));
if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length) inc = 2;
while (inc >= 0)
{ {
osd_idx[inc] = 0; osd_idx[inc]++;
inc--; if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
if (inc < 0)
// no osds left in the current host combination, select the next one
inc = pg_size-1;
same_again: while (inc >= 0)
for (let prev_host = 0; prev_host < inc; prev_host++)
{ {
if (host_idx[prev_host] == host_idx[inc]) osd_idx[inc] = 0;
{ inc--;
continue same_again;
if (host_idx[inc] < (ordered ? hosts.length-(pg_size-1-inc) : hosts.length))
while ((++inc) < pg_size)
host_idx[inc] = (ordered ? host_idx[inc-1]+1 : 0);
} }
else else
{ {
inc--; break;
} }
} }
if (inc < 0) }
inc = -1;
if (inc < 0)
// no osds left in current host combination, select the next one
osd_idx = [ 0, 0, 0 ];
if (host_idx[2] >= hosts.length)
{ {
break; host_idx[1]++;
host_idx[2] = ordered ? host_idx[1]+1 : 0;
if ((ordered ? host_idx[2] : host_idx[1]) >= hosts.length)
host_idx[1] = ordered ? host_idx[0]+1 : 0;
host_idx[2] = ordered ? host_idx[1]+1 : 0;
if ((ordered ? host_idx[2] : host_idx[0]) >= hosts.length)
} }
} }
} }
@ -661,15 +468,14 @@ function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes); return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes);
} }
function pg_list_space_efficiency(pgs, osd_sizes, pg_minsize, parity_space) function pg_list_space_efficiency(pgs, osd_sizes)
{ {
const per_osd = {}; const per_osd = {};
for (const pg of pgs) for (const pg of pgs)
{ {
for (let i = 0; i < pg.length; i++) for (const osd of pg)
{ {
const osd = pg[i]; per_osd[osd] = (per_osd[osd]||0) + 1;
per_osd[osd] = (per_osd[osd]||0) + (i >= pg_minsize ? (parity_space||1) : 1);
} }
} }
return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes); return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes);
@ -711,6 +517,5 @@ module.exports = {
lp_solve, lp_solve,
make_int_pgs, make_int_pgs,
align_pgs, align_pgs,
all_combinations, all_combinations,
}; };

mon/mon-main.js → lp/mon-main.js Executable file → Normal file
View File

@ -1,8 +1,5 @@
#!/usr/bin/node #!/usr/bin/node
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
const Mon = require('./mon.js'); const Mon = require('./mon.js');
const options = {}; const options = {};
@ -18,8 +15,8 @@ for (let i = 2; i < process.argv.length; i++)
if (!options.etcd_url) if (!options.etcd_url)
{ {
console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url ",..." --etcd_prefix "/vitastor" --etcd_start_timeout 5 [--verbose 1]'); console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url ",..." --etcd_prefix "/rage" --etcd_start_timeout 5');
process.exit(); process.exit();
} }
new Mon(options).start().catch(e => { console.error(e); process.exit(); }); new Mon(options).start();

lp/mon.js Normal file
View File

@ -0,0 +1,858 @@
const http = require('http');
const os = require('os');
const WebSocket = require('ws');
const LPOptimizer = require('./lp-optimizer.js');
const stableStringify = require('./stable-stringify.js');
class Mon
static etcd_tree = {
config: {
global: null,
/* placement_tree = {
levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
nodes: { host1: { level: 'host', parent: 'rack1' }, ... },
failure_domain: 'host',
} */
placement_tree: null,
osd: {},
pgs: {},
osd: {
state: {},
stats: {},
mon: {
master: null,
pg: {
change_stamp: null,
state: {},
stats: {},
history: {},
// FIXME: Maybe prefer local etcd
this.etcd_urls = [];
for (let url of config.etcd_url.split(/,/))
let scheme = 'http';
url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
if (!/\/[^\/]/.exec(url))
url += '/v3';
this.etcd_prefix = config.etcd_prefix || '/rage';
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(Mon.etcd_tree));
async start()
await this.load_config();
await this.get_lease();
await this.become_master();
await this.load_cluster_state();
await this.start_watcher();
await this.recheck_pgs();
async load_config()
const res = await this.etcd_call('/txn', { success: [
{ requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
] }, this.etcd_start_timeout, -1);
this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
if (this.config.etcd_mon_timeout <= 0)
this.config.etcd_mon_timeout = 1000;
this.config.etcd_mon_retries = Number(this.config.etcd_mon_retries) || 5;
if (this.config.etcd_mon_retries < 0)
this.config.etcd_mon_retries = 0;
this.config.mon_change_timeout = Number(this.config.mon_change_timeout) || 1000;
if (this.config.mon_change_timeout < 100)
this.config.mon_change_timeout = 100;
this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
if (this.config.mon_stats_timeout < 100)
this.config.mon_stats_timeout = 100;
// After this number of seconds, a dead OSD will be removed from PG distribution
this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
if (!this.config.osd_out_time)
this.config.osd_out_time = 30*60; // 30 minutes by default
this.config.max_osd_combinations = Number(this.config.max_osd_combinations) || 10000;
if (this.config.max_osd_combinations < 100)
this.config.max_osd_combinations = 100;
async start_watcher(retries)
let retry = 0;
if (retries >= 0 && retries < 1)
retries = 1;
while (retries < 0 || retry < retries)
const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
const ok = await new Promise((ok, no) =>
const timer_id = setTimeout(() =>
}, timeout); = new WebSocket(base+'/watch');'open', () =>
if (timer_id)
if (!ok)
{ = null;
if (!
this.die('Failed to open etcd watch websocket');
create_request: {
key: b64(this.etcd_prefix+'/'),
range_end: b64(this.etcd_prefix+'0'),
start_revision: ''+this.etcd_watch_revision,
watch_id: 1,
}));'message', (msg) =>
let data;
data = JSON.parse(msg);
catch (e)
if (!data || !data.result || !
console.error('Garbage received from watch websocket: '+msg);
let stats_changed = false, changed = false;
console.log('Revision '+data.result.header.revision+' events: ');
for (const e of
const key = e.kv.key.substr(this.etcd_prefix.length);
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
stats_changed = true;
else if (key != '/stats')
changed = true;
if (stats_changed)
if (changed)
async get_lease()
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
this.etcd_lease_id = res.ID;
setInterval(async () =>
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
if (!res.result.TTL)
this.die('Lease expired');
}, config.etcd_mon_timeout);
async become_master()
const state = { ip: this.local_ips() };
while (1)
const res = await this.etcd_call('/txn', {
compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.etcd_prefix+'/mon/master') } ],
success: [ { key: b64(this.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.etcd_lease_id } ],
}, this.etcd_start_timeout, 0);
if (!res.succeeded)
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout));
async load_cluster_state()
const res = await this.etcd_call('/txn', { success: [
{ requestRange: { key: b64(this.etcd_prefix+'/'), range_end: b64(this.etcd_prefix+'0') } },
] }, this.etcd_start_timeout, -1);
this.etcd_watch_revision = BigInt(res.header.revision)+BigInt(1);
const data = JSON.parse(JSON.stringify(Mon.etcd_tree));
for (const response of res.responses)
for (const kv of response.response_range.kvs)
this.state = data;
return Object.keys(this.state.osd.stats);
this.state.config.placement_tree = this.state.config.placement_tree||{};
const levels = this.state.config.placement_tree.levels||{}; = || 100;
levels.osd = levels.osd || 101;
const tree = { '': { children: [] } };
for (const node_id in this.state.config.placement_tree.nodes||{})
const node_cfg = this.state.config.placement_tree.nodes[node_id];
if (!node_id || /^\d/.exec(node_id) ||
!node_cfg.level || !levels[node_cfg.level])
// All nodes must have non-empty non-numeric IDs and valid levels
tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
const down_time = - this.config.osd_out_time;
for (const osd_num of this.all_osds().sort((a, b) => a - b))
const stat = this.state.osd.stats[osd_num];
if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
// Numeric IDs are reserved for OSDs
const reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight) || 1;
tree[osd_num] = tree[osd_num] || { id: osd_num, parent: };
tree[osd_num].level = 'osd';
tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
delete tree[osd_num].children;
for (const node_id in tree)
if (node_id === '')
const node_cfg = tree[node_id];
const node_level = levels[node_cfg.level] || node_cfg.level;
let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
&& tree[node_cfg.parent].level;
parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
// Parent's level must be less than child's; OSDs must be leaves
const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
delete node_cfg.parent;
return LPOptimizer.flatten_tree(tree[''].children, levels, this.state.config.failure_domain, 'osd');
async stop_all_pgs()
let has_online = false, paused = true;
for (const pg in this.state.config.pgs.items||{})
const cur_state = (([pg]||{}).state||[]).join(',');
if (cur_state != '' && cur_state != 'offline')
has_online = true;
if (!this.state.config.pgs.items[pg].pause)
paused = false;
if (!paused)
console.log('Stopping all PGs before changing PG count');
const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
for (const pg in new_cfg.items)
new_cfg.items[pg].pause = true;
// Check that no OSDs change their state before we pause PGs
// Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
// and can't see the old PG configuration
const checks = [];
for (const osd_num of this.all_osds())
const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
const res = await this.etcd_call('/txn', {
compare: [
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
success: [
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
}, this.config.etcd_mon_timeout, 0);
if (!res.succeeded)
return false;
this.state.config.pgs = new_cfg;
return !has_online;
scale_pg_count(prev_pgs, pg_history, new_pg_count)
const old_pg_count = prev_pgs.length;
// Add all possibly intersecting PGs into the history of new PGs
if (!(new_pg_count % old_pg_count))
// New PG count is a multiple of the old PG count
const mul = (new_pg_count / old_pg_count);
for (let i = 0; i < new_pg_count; i++)
const old_i = Math.floor(new_pg_count / mul);
pg_history[i] = JSON.parse(JSON.stringify([1+old_i]));
else if (!(old_pg_count % new_pg_count))
// Old PG count is a multiple of the new PG count
const mul = (old_pg_count / new_pg_count);
for (let i = 0; i < new_pg_count; i++)
pg_history[i] = {
osd_sets: [],
all_peers: [],
for (let j = 0; j < mul; j++)
const hist =[1+i*mul+j];
if (hist && hist.osd_sets && hist.osd_sets.length)
Array.prototype.push.apply(pg_history[i].osd_sets, hist.osd_sets);
if (hist && hist.all_peers && hist.all_peers.length)
Array.prototype.push.apply(pg_history[i].all_peers, hist.all_peers);
// Any PG may intersect with any PG after non-multiple PG count change
// So, merge ALL PGs history
let all_sets = {};
let all_peers = {};
for (const pg of prev_pgs)
all_sets[pg.join(' ')] = pg;
for (const pg in
const hist =[pg];
if (hist && hist.osd_sets)
for (const pg of hist.osd_sets)
all_sets[pg.join(' ')] = pg;
if (hist && hist.all_peers)
for (const osd_num of hist.all_peers)
all_peers[osd_num] = Number(osd_num);
all_sets = Object.values(all_sets);
all_peers = Object.values(all_peers);
for (let i = 0; i < new_pg_count; i++)
pg_history[i] = { osd_sets: all_sets, all_peers };
// Mark history keys for removed PGs as removed
for (let i = new_pg_count; i < old_pg_count; i++)
pg_history[i] = null;
if (old_pg_count < new_pg_count)
for (let i = new_pg_count-1; i >= 0; i--)
prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
else if (old_pg_count > new_pg_count)
for (let i = 0; i < new_pg_count; i++)
prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
async save_new_pgs(prev_pgs, new_pgs, pg_history, tree_hash)
const txn = [], checks = [];
const pg_items = {};, i) =>
osd_set = => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
const alive_set = osd_set.filter(osd_num => osd_num);
pg_items[i+1] = {
primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
target: 'MOD',
mod_revision: ''+this.etcd_watch_revision,
result: 'LESS',
if (pg_history[i])
requestPut: {
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
value: b64(JSON.stringify(pg_history[i])),
requestDeleteRange: {
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
this.state.config.pgs = {
hash: tree_hash,
items: pg_items,
const res = await this.etcd_call('/txn', {
compare: [
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
success: [
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
}, this.config.etcd_mon_timeout, 0);
return res.succeeded;
async recheck_pgs()
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
const tree_cfg = {
osd_tree: this.get_osd_tree(),
pg_count: this.config.pg_count || Object.keys(this.state.config.pgs.items||{}).length || 128,
max_osd_combinations: this.config.max_osd_combinations,
const tree_hash = sha1hex(stableStringify(tree_cfg));
if (this.state.config.pgs.hash != tree_hash)
// Something has changed
const prev_pgs = [];
for (const pg in this.state.config.pgs.items||{})
prev_pgs[pg-1] = this.state.config.pgs.items[pg].osd_set;
const pg_history = [];
const old_pg_count = prev_pgs.length;
let optimize_result;
if (old_pg_count > 0)
if (old_pg_count != tree_cfg.pg_count)
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs())
this.scale_pg_count(prev_pgs, pg_history, new_pg_count);
optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, tree_cfg.max_osd_combinations);
optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, tree_cfg.pg_count, tree_cfg.max_osd_combinations);
if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash))
console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
console.log('PG configuration successfully changed');
if (old_pg_count != optimize_result.int_pgs.length)
console.log(`PG count changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`);
if (this.recheck_timer)
this.recheck_timer = null;
this.recheck_timer = setTimeout(() =>
this.recheck_timer = null;
}, this.config.mon_change_timeout || 1000);
let overflow = false;
this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
const op_stats = {}, subop_stats = {}, recovery_stats = {};
for (const osd in this.state.osd.stats)
const st = this.state.osd.stats[osd];
for (const op in st.op_stats||{})
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
op_stats[op].count += BigInt(st.op_stats.count||0);
op_stats[op].usec += BigInt(st.op_stats.usec||0);
op_stats[op].bytes += BigInt(st.op_stats.bytes||0);
for (const op in st.subop_stats||{})
subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
subop_stats[op].count += BigInt(st.subop_stats.count||0);
subop_stats[op].usec += BigInt(st.subop_stats.usec||0);
for (const op in st.recovery_stats||{})
recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
recovery_stats[op].count += BigInt(st.recovery_stats.count||0);
recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
for (const op in op_stats)
if (op_stats[op].count >= 0x10000000000000000n)
if (!this.prev_stats.op_stats[op])
overflow = true;
op_stats[op].count -= this.prev_stats.op_stats[op].count;
op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
for (const op in subop_stats)
if (subop_stats[op].count >= 0x10000000000000000n)
if (!this.prev_stats.subop_stats[op])
overflow = true;
subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
for (const op in recovery_stats)
if (recovery_stats[op].count >= 0x10000000000000000n)
if (!this.prev_stats.recovery_stats[op])
overflow = true;
recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
for (const pg_num in
const st =[pg_num];
for (const k in object_counts)
if (st[k+'_count'])
object_counts[k] += BigInt(st[k+'_count']);
return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
async update_total_stats()
const stats = this.sum_stats();
if (!stats.overflow)
// Convert to strings, serialize and save
const ser = {};
for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
ser[st] = {};
for (const op in stats[st])
ser[st][op] = {};
for (const k in stats[st][op])
ser[st][op][k] = ''+stats[st][op][k];
ser.object_counts = {};
for (const k in stats.object_counts)
ser.object_counts[k] = ''+stats.object_counts[k];
await this.etcd_call('/txn', {
success: [ { requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } } ],
}, this.config.etcd_mon_timeout, 0);
if (this.stats_timer)
this.stats_timer = null;
this.stats_timer = setTimeout(() =>
this.stats_timer = null;
}, this.config.mon_stats_timeout || 1000);
if (!kv || !kv.key)
kv.key = de64(kv.key);
kv.value = kv.value ? JSON.parse(de64(kv.value)) : null;
const key = kv.key.substr(this.etcd_prefix.length).replace(/^\/+/, '').split('/');
const cur = this.state, orig = Mon.etcd_tree;
for (let i = 0; i < key.length-1; i++)
if (!orig[key[i]])
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
orig = orig[key[i]];
cur = (cur[key[i]] = cur[key[i]] || {});
if (orig[key.length-1])
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
cur[key[key.length-1]] = kv.value;
if (key.join('/') === 'config/global')
{ = || {};
this.config =;
async etcd_call(path, body, timeout, retries)
let retry = 0;
if (retries >= 0 && retries < 1)
retries = 1;
while (retries < 0 || retry < retries)
const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
const res = await POST(base+path, body, timeout);
if (res.json)
if (res.json.error)
console.log('etcd returned error: '+res.json.error);
return res.json;
// In fact we can just try to rejoin
console.fatal(err || 'Cluster connection failed');
const ips = [];
const ifaces = os.networkInterfaces();
for (const ifname in ifaces)
for (const iface of ifaces[ifname])
if ( == 'IPv4' && !iface.internal)
return ips;
function POST(url, body, timeout)
return new Promise((ok, no) =>
const body_text = Buffer.from(JSON.stringify(body));
let timer_id = timeout > 0 ? setTimeout(() =>
if (req)
req = null;
ok({ error: 'timeout' });
}, timeout) : null;
let req = http.request(url, { method: 'POST', headers: {
'Content-Type': 'application/json',
'Content-Length': body_text,
} }, (res) =>
if (!req)
if (res.statusCode != 200)
ok({ error: res.statusCode, response: res });
let res_body = '';
res.on('data', chunk => { res_body += chunk });
res.on('end', () =>
res_body = JSON.parse(res_body);
ok({ response: res, json: res_body });
catch (e)
ok({ error: e, response: res, body: res_body });
// Encode a string as base64.
// str is converted to bytes by Buffer.from() with its default string encoding.
function b64(str)
{
    return Buffer.from(str).toString('base64');
}
// Decode a base64 string back into text.
// The decoded bytes are converted with Buffer#toString()'s default encoding.
function de64(str)
{
    return Buffer.from(str, 'base64').toString();
}
// Return the SHA-1 digest of str as a lowercase hex string.
function sha1hex(str)
{
    // Required locally so the function does not depend on a file-level
    // `crypto` binding being present.
    const crypto = require('crypto');
    const hash = crypto.createHash('sha1');
    // Feed the input into the hash; without this step the digest is always
    // the SHA-1 of empty input, regardless of str.
    hash.update(str);
    return hash.digest('hex');
}

View File

@ -1,15 +1,14 @@
{ {
"name": "vitastor-mon", "name": "rage-mon",
"version": "1.0.0", "version": "1.0.0",
"description": "Vitastor SDS monitor service", "description": "RAGE storage monitor service",
"main": "mon-main.js", "main": "mon.js",
"scripts": { "scripts": {
"test": "echo \"Error: no test specified\" && exit 1" "test": "echo \"Error: no test specified\" && exit 1"
}, },
"author": "Vitaliy Filippov", "author": "Vitaliy Filippov",
"license": "UNLICENSED", "license": "UNLICENSED",
"dependencies": { "dependencies": {
"sprintf-js": "^1.1.2",
"ws": "^7.2.5" "ws": "^7.2.5"
} }
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
const LPOptimizer = require('./lp-optimizer.js'); const LPOptimizer = require('./lp-optimizer.js');
const crush_tree = [ const crush_tree = [
@ -43,31 +40,31 @@ async function run()
{ {
const cur_tree = {}; const cur_tree = {};
console.log('Empty tree:'); console.log('Empty tree:');
let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 }); let res = await LPOptimizer.optimize_initial(cur_tree, 256);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 1st failure domain:'); console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1']; cur_tree['dom1'] = osd_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 2nd failure domain:'); console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2']; cur_tree['dom2'] = osd_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 3rd failure domain:'); console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3']; cur_tree['dom3'] = osd_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 3rd failure domain:'); console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3']; delete cur_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 2nd failure domain:'); console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2']; delete cur_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 1st failure domain:'); console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1']; delete cur_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
} }

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
const LPOptimizer = require('./lp-optimizer.js'); const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = { const osd_tree = {
@ -78,37 +75,19 @@ const crush_tree = [
async function run() async function run()
{ {
let res;
// Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13% // Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13%
// Space efficiency is ~99% in all cases. // Space efficiency is ~99.5% in both cases.
let res = await LPOptimizer.optimize_initial(osd_tree, 256);
console.log('256 PGs, size=2');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nAdding osd.8'); console.log('adding osd.8');
osd_tree[500][8] = 3.58589; osd_tree[500][8] = 3.58589;
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }); res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving osd.8'); console.log('removing osd.8');
delete osd_tree[500][8]; delete osd_tree[500][8];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }); res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 256);
console.log('\n256 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256 });
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding osd.8');
osd_tree[500][8] = 3.58589;
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving osd.8');
delete osd_tree[500][8];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
console.log('\n256 PGs, size=3, failure domain=rack');
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
} }

View File

@ -1,104 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
module.exports = {
// Merge the OSD set and the stored history of old PG <old_pg> into the
// history entry of new PG <new_pg>, creating that entry if it is missing.
//
// new_pg_history  - container of history entries being built, indexed by new_pg
// prev_pgs        - previous OSD sets; prev_pgs[old_pg] must be an array
// prev_pg_history - previous history entries; prev_pg_history[old_pg] may be absent
//
// osd_sets and all_peers are accumulated as objects (keyed by the
// space-joined OSD set and by OSD number respectively), so repeated
// entries collapse into one key.
function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_pg)
{
    if (!new_pg_history[new_pg])
    {
        new_pg_history[new_pg] = {
            osd_sets: {},
            all_peers: {},
            epoch: 0,
        };
    }
    const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
    // The previous OSD set of the old PG itself always becomes part of the history
    nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
    if (oh && oh.osd_sets && oh.osd_sets.length)
    {
        for (const pg of oh.osd_sets)
        {
            nh.osd_sets[pg.join(' ')] = pg;
        }
    }
    if (oh && oh.all_peers && oh.all_peers.length)
    {
        for (const osd_num of oh.all_peers)
        {
            nh.all_peers[osd_num] = Number(osd_num);
        }
    }
    if (oh && oh.epoch)
    {
        // Keep the larger of the two epochs
        nh.epoch = nh.epoch < oh.epoch ? oh.epoch : nh.epoch;
    }
}
// Convert the osd_sets and all_peers members of a history entry from
// objects into arrays of their values. The entry is modified in place.
function finish_pg_history(merged_history)
{
    merged_history.osd_sets = Object.values(merged_history.osd_sets);
    merged_history.all_peers = Object.values(merged_history.all_peers);
}
function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
const old_pg_count = prev_pgs.length;
// Add all possibly intersecting PGs to the history of new PGs
if (!(new_pg_count % old_pg_count))
// New PG count is a multiple of old PG count
for (let i = 0; i < new_pg_count; i++)
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
else if (!(old_pg_count % new_pg_count))
// Old PG count is a multiple of the new PG count
const mul = (old_pg_count / new_pg_count);
for (let i = 0; i < new_pg_count; i++)
for (let j = 0; j < mul; j++)
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
// Any PG may intersect with any PG after non-multiple PG count change
// So, merge ALL PGs history
let merged_history = {};
for (let i = 0; i < old_pg_count; i++)
add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
for (let i = 0; i < new_pg_count; i++)
new_pg_history[i] = { ...merged_history[1] };
// Mark history keys for removed PGs as removed
for (let i = new_pg_count; i < old_pg_count; i++)
new_pg_history[i] = null;
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
if (old_pg_count < new_pg_count)
for (let i = old_pg_count; i < new_pg_count; i++)
prev_pgs[i] = prev_pgs[i % old_pg_count];
else if (old_pg_count > new_pg_count)
prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);

// Functions to calculate Annualized Failure Rate of your cluster
// if you know AFR of your drives, number of drives, expected rebalance time
// and replication factor
// License: VNPL-1.1 (see for details) or AGPL-3.0
// Author: Vitaliy Filippov, 2020+
module.exports = {
// Estimate AFR of the cluster
// n - number of drives
// afr - annualized failure rate of a single drive
// l - expected rebalance time in days after a single drive failure
// k - replication factor / number of drives that must fail at the same time for the cluster to fail
// Returns 1 - (1 - afr * q) ** (n-(k-1)), where q is
// failure_rate_fullmesh(n-(k-1), afr*l/365, k-1).
function cluster_afr_fullmesh(n, afr, l, k)
{
    return 1 - (1 - afr * failure_rate_fullmesh(n-(k-1), afr*l/365, k-1)) ** (n-(k-1));
}
// Probability of at least <f> failures in a cluster with <n> drives with AFR=<a>
// Computed as 1 minus the binomial probabilities of exactly 0, 1, ..., f-1 failures.
function failure_rate_fullmesh(n, a, f)
{
    if (f <= 0)
    {
        // Special case as written: for non-positive <f> the result is
        // the probability that none of the <n> drives fails
        return (1-a)**n;
    }
    let p = 1;
    for (let i = 0; i < f; i++)
    {
        // Subtract P(exactly i failures)
        p -= c_n_k(n, i) * (1-a)**(n-i) * a**i;
    }
    return p;
}
// <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
// <k> replicas, <pgs> unique peer PGs per OSD (~50 for 100 PG-per-OSD in a big cluster)
// For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
// More peers per OSD increase rebalance speed (more drives work together to resilver) if you
// let them finish rebalance BEFORE replacing the failed drive (degraded_replacement=false).
// At the same time, more peers per OSD increase probability of any of them to fail!
// osd_rm=true means that failed OSDs' data is rebalanced over all other hosts,
// not over the same host as it's in Ceph by default (dead OSDs are marked 'out').
// Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
// So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
// is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
// because it gets increased linearly by increased number of peers to fail
// and decreased linearly by reduced rebalance time.
// Estimate the annualized failure rate of a cluster from a configuration object.
// Fields read: n_hosts, n_drives, afr_drive, afr_host, capacity, speed,
// ec / ec_data / ec_parity (erasure coding) or replicas, pgs (default 1),
// osd_rm, degraded_replacement, down_out_interval (default 600; it is added to
// capacity/speed and divided by 86400, i.e. treated as seconds).
function cluster_afr({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec, ec_data, ec_parity, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
{
    const pg_size = (ec ? ec_data+ec_parity : replicas);
    // Peer count is capped by the number of drives available on other hosts
    const max_peers = (n_hosts-1)*n_drives/(pg_size-1);
    pgs = Math.min(pgs, max_peers);
    const host_pgs = Math.min(pgs*n_drives, max_peers);
    // Number of drives taking part in resilvering a single failed drive
    const resilver_disk = (n_drives == 1 || osd_rm) ? pgs : (n_drives-1);
    // Heal times are expressed in years
    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
    // Probability of one more peer failing while healing is in progress
    const peer_afr = afr_drive+afr_host/n_drives;
    const disk_heal_fail = (peer_afr*disk_heal_time);
    const host_heal_fail = (peer_afr*host_heal_time);
    // Probability of losing a whole PG after a drive / host failure
    const disk_pg_fail = ec
        ? failure_rate_fullmesh(ec_data+ec_parity-1, disk_heal_fail, ec_parity)
        : disk_heal_fail**(replicas-1);
    const host_pg_fail = ec
        ? failure_rate_fullmesh(ec_data+ec_parity-1, host_heal_fail, ec_parity)
        : host_heal_fail**(replicas-1);
    return 1 - ((1 - afr_drive * (1-(1-disk_pg_fail)**pgs)) ** (n_hosts*n_drives))
        * ((1 - afr_host * (1-(1-host_pg_fail)**host_pgs)) ** n_hosts);
}
/******** UTILITY ********/
// Combination count
function c_n_k(n, k)
let r = 1;
for (let i = 0; i < k; i++)
r *= (n-i) / (i+1);
return r;

const { sprintf } = require('sprintf-js');
const { cluster_afr } = require('./afr.js');
print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 2 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
// Print a one-line description of a cluster configuration followed by
// its estimated AFR (as a percentage) computed by cluster_afr().
function print_cluster_afr(config)
{
    console.log(
        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
        // Describe the redundancy scheme: erasure coding or plain replication
        (config.ec ? `, EC ${config.ec_data}+${config.ec_parity}` : `, ${config.replicas} replicas`)+
        `, ${config.pgs||1} PG per OSD`+
        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
    );
    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
}

View File

# Very simple systemd unit generator for vitastor-osd services
# Not the final solution yet, mostly for tests
# Copyright (c) Vitaliy Filippov, 2019+
# License: MIT
# USAGE: ./ /dev/disk/by-partuuid/xxx [ /dev/disk/by-partuuid/yyy]...
set -e -x
IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
[ "$IP" != "" ] || exit 1
ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
D=`dirname $0`
# Create OSDs on all passed devices
for DEV in $*; do
# Ugly :) -> node.js rework pending
while true; do
ST=$(etcdctl --endpoints="$ETCD_MON" get --print-value-only /vitastor/osd/stats/$OSD_NUM)
if [ "$ST" = "" ]; then
etcdctl --endpoints="$ETCD_MON" put /vitastor/osd/stats/$OSD_NUM '{}'
echo Creating OSD $OSD_NUM on $DEV
OPT=`node $D/simple-offsets.js --device $DEV --format options | tr '\n' ' '`
META=`echo $OPT | grep -Po '(?<=data_offset )\d+'`
dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct
cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
Description=Vitastor object storage daemon osd.$OSD_NUM
ExecStart=/usr/bin/vitastor-osd \\
--etcd_address $IP:2379/v3 \\
--bind_address $IP \\
--osd_num $OSD_NUM \\
--disable_data_fsync 1 \\
--immediate_commit all \\
--flusher_count 256 \\
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
ExecStartPre=+chown vitastor:vitastor $DEV
systemctl enable vitastor-osd$OSD_NUM

# Very simple systemd unit generator for etcd & vitastor-mon services
# Not the final solution yet, mostly for tests
# Copyright (c) Vitaliy Filippov, 2019+
# License: MIT
# USAGE: ./
# determine IP
IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
[ "$IP" != "" ] || exit 1
[ "$ETCD_NUM" != "$ETCD_HOSTS" ] || exit 1
ETCD_NUM=$(echo $ETCD_NUM | tr -d -c , | wc -c)
# etcd
useradd etcd
mkdir -p /var/lib/etcd$ETCD_NUM.etcd
cat >/etc/systemd/system/etcd.service <<EOF
Description=etcd for vitastor
ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_NUM.etcd \\
--advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
--initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
--initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
--initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
systemctl daemon-reload
systemctl enable etcd
systemctl start etcd
useradd vitastor
chmod 755 /root
# Vitastor target
cat >/etc/systemd/system/ <<EOF
Description=vitastor target
# Monitor unit
ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
cat >/etc/systemd/system/vitastor-mon.service <<EOF
Description=Vitastor monitor
ExecStart=node /usr/lib/vitastor/mon/mon-main.js --etcd_url '$ETCD_MON' --etcd_prefix '/vitastor' --etcd_start_timeout 5

const fsp = require('fs').promises;
// Overlay <file2> on top of <file1> and write the result to <out>.
// Both files are read fully into memory and compared in 4096-byte blocks:
// every block of file2 that is not entirely zero replaces the same block of
// file1, all-zero blocks of file2 leave file1's data untouched.
async function merge(file1, file2, out)
{
    if (!out)
    {
        console.error('USAGE: nodejs merge.js layer1 layer2 output');
        // Stop here instead of going on to read undefined paths
        process.exit(1);
    }
    let layer1 = await fsp.readFile(file1);
    const layer2 = await fsp.readFile(file2);
    if (layer2.length > layer1.length)
    {
        // Buffer.copy() silently drops bytes that do not fit into the target,
        // so extend the base layer with zeroes up to the size of the upper one
        const grown = Buffer.alloc(layer2.length);
        layer1.copy(grown);
        layer1 = grown;
    }
    const zero = Buffer.alloc(4096);
    for (let i = 0; i < layer2.length; i += 4096)
    {
        // The last block may be shorter than 4096 bytes; Buffer.compare()
        // throws ERR_OUT_OF_RANGE if the end offset exceeds the buffer length
        const end = Math.min(i+4096, layer2.length);
        if (zero.compare(layer2, i, end, 0, end-i) != 0)
        {
            layer2.copy(layer1, i, i, end);
        }
    }
    await fsp.writeFile(out, layer1);
}
merge(process.argv[2], process.argv[3], process.argv[4]);


// Copyright (c) Vitaliy Filippov, 2019+
// License: MIT
// Simple tool to calculate journal and metadata offsets for a single device
// Will be replaced by smarter tools in the future
const fs = require('fs').promises;
const child_process = require('child_process');
async function run()
const options = {
object_size: 128*1024,
bitmap_granularity: 4096,
journal_size: 16*1024*1024,
device_block_size: 4096,
journal_offset: 0,
device_size: 0,
format: 'text',
for (let i = 2; i < process.argv.length; i++)
if (process.argv[i].substr(0, 2) == '--')
options[process.argv[i].substr(2)] = process.argv[i+1];
if (!options.device)
process.stderr.write('USAGE: nodejs '+process.argv[1]+' --device /dev/sdXXX\n');
options.device_size = Number(options.device_size);
let device_size = options.device_size;
if (!device_size)
const st = await fs.stat(options.device);
options.device_block_size = st.blksize;
if (st.isBlockDevice())
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
device_size = st.size;
if (!device_size)
process.stderr.write('Failed to get device size\n');
options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
const object_count = Math.floor((device_size-meta_offset)/options.object_size);
const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size;
const data_offset = meta_offset + meta_size;
const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
: Math.round(meta_size/1024/1024*100)/100+" MB");
if (options.format == 'text' || options.format == 'options')
if (options.format == 'text')
`Metadata size: ${meta_size_fmt}\n`+
`Options for the OSD:\n`
` --data_device ${options.device}\n`+
` --journal_offset ${options.journal_offset}\n`+
` --meta_offset ${meta_offset}\n`+
` --data_offset ${data_offset}\n`+
(options.device_size ? ` --data_size ${device_size-data_offset}\n` : '')
else if (options.format == 'env')
process.stdout.write('Unknown format: '+options.format);
// Run a shell command and return a Promise for its standard output.
// The promise resolves with stdout and rejects with the error MESSAGE
// (a string, not the Error object) if child_process.exec() reports a failure.
function system(cmd)
{
    return new Promise((ok, no) =>
    {
        // Output buffer limit is 64 MB
        child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) =>
        {
            if (err)
            {
                no(err.message);
            }
            else
            {
                ok(stdout);
            }
        });
    });
}
run().catch(err => { console.error(err); process.exit(1); });

// Copyright (c) Vitaliy Filippov, 2019+
// License: MIT
// Serialize <obj> to JSON with object keys emitted in sorted order,
// so that equal values always produce the same string.
// opts may be a comparator function or an object with:
//   cmp - comparator called with two { key, value } pairs of the same object
//   space - indentation string, or a number of spaces
//   cycles - if true, a circular reference is emitted as "__cycle__"
//     instead of throwing a TypeError
// Returns the JSON string, or undefined when <obj> itself is not serializable.
function stableStringify(obj, opts)
{
    if (!opts)
    {
        opts = {};
    }
    if (typeof opts === 'function')
    {
        opts = { cmp: opts };
    }
    let space = opts.space || '';
    if (typeof space === 'number')
    {
        space = Array(space+1).join(' ');
    }
    const cycles = (typeof opts.cycles === 'boolean') ? opts.cycles : false;
    // Adapt the user comparator to Array.sort(): it receives { key, value } pairs
    const cmp = opts.cmp && (function (f)
    {
        return function (node)
        {
            return function (a, b)
            {
                let aobj = { key: a, value: node[a] };
                let bobj = { key: b, value: node[b] };
                return f(aobj, bobj);
            };
        };
    })(opts.cmp);
    // Containers on the current path from the root, used to detect cycles
    const seen = new Map();
    return (function stringify (parent, key, node, level)
    {
        const indent = space ? ('\n' + new Array(level + 1).join(space)) : '';
        const colonSeparator = space ? ': ' : ':';
        if (node === undefined)
        {
            return;
        }
        if (typeof node !== 'object' || node === null)
        {
            return JSON.stringify(node);
        }
        if (seen.has(node))
        {
            if (cycles)
            {
                return JSON.stringify('__cycle__');
            }
            throw new TypeError('Converting circular structure to JSON');
        }
        // Arrays are tracked too, otherwise a self-containing array recurses forever
        seen.set(node, true);
        let result;
        if (node instanceof Array)
        {
            const out = [];
            for (let i = 0; i < node.length; i++)
            {
                // Unserializable array items become null, like in JSON.stringify()
                const item = stringify(node, i, node[i], level+1) || JSON.stringify(null);
                out.push(indent + space + item);
            }
            result = '[' + out.join(',') + indent + ']';
        }
        else
        {
            const keys = Object.keys(node).sort(cmp && cmp(node));
            const out = [];
            for (let i = 0; i < keys.length; i++)
            {
                const key = keys[i];
                const value = stringify(node, key, node[key], level+1);
                if (!value)
                {
                    // Unserializable properties are omitted
                    continue;
                }
                const keyValue = JSON.stringify(key)
                    + colonSeparator
                    + value;
                out.push(indent + space + keyValue);
            }
            result = '{' + out.join(',') + indent + '}';
        }
        // Leave the path: the same object may legitimately appear again elsewhere
        seen.delete(node);
        return result;
    })({ '': obj }, '', obj, 0);
}
module.exports = stableStringify;

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
// Interesting real-world example coming from Ceph with EC and compression enabled.
// EC parity chunks can't be compressed as efficiently as data chunks,
// thus they occupy more space (2.26x more space) in OSD object stores.
// This leads to really uneven OSD fill ratio in Ceph even when PGs are perfectly balanced.
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
ripper5: {
osd0: 3.493144989013672,
osd1: 3.493144989013672,
osd2: 3.454082489013672,
osd12: 3.461894989013672,
ripper7: {
osd4: 3.638690948486328,
osd5: 3.638690948486328,
osd6: 3.638690948486328,
ripper4: {
osd9: 3.4609375,
osd10: 3.4609375,
osd11: 3.4609375,
ripper6: {
osd3: 3.5849609375,
osd7: 3.5859336853027344,
osd8: 3.638690948486328,
osd13: 3.461894989013672
const prev_pgs = [[12,7,5],[6,11,12],[3,6,9],[10,0,5],[2,5,13],[9,8,6],[3,4,12],[7,4,12],[12,11,13],[13,6,0],[4,13,10],[9,7,6],[7,10,0],[10,8,0],[3,10,2],[3,0,4],[6,13,0],[13,10,0],[13,10,5],[8,11,6],[3,9,2],[2,8,5],[8,9,5],[3,12,11],[0,7,4],[13,11,1],[11,3,12],[12,8,10],[7,5,12],[2,13,5],[7,11,0],[13,2,6],[0,6,8],[13,1,6],[0,13,4],[0,8,10],[4,10,0],[8,12,4],[8,12,9],[12,7,4],[13,9,5],[3,2,11],[1,9,7],[1,8,5],[5,12,9],[3,5,12],[2,8,10],[0,8,4],[1,4,11],[7,10,2],[12,13,5],[3,1,11],[7,1,4],[4,12,8],[7,0,9],[11,1,8],[3,0,5],[11,13,0],[1,13,5],[12,7,10],[12,8,4],[11,13,5],[0,11,6],[2,11,3],[13,1,11],[2,7,10],[7,10,12],[7,12,10],[12,11,5],[13,12,10],[2,3,9],[4,3,9],[13,2,5],[7,12,6],[12,10,13],[9,8,1],[13,1,5],[9,5,12],[5,11,7],[6,2,9],[8,11,6],[12,5,8],[6,13,1],[7,6,11],[2,3,6],[8,5,9],[1,13,6],[9,3,2],[7,11,1],[3,10,1],[0,11,7],[3,0,5],[1,3,6],[6,0,9],[3,11,4],[8,10,2],[13,1,9],[12,6,9],[3,12,9],[12,8,9],[7,5,0],[8,12,5],[0,11,3],[12,11,13],[0,7,11],[0,3,10],[1,3,11],[2,7,11],[13,2,6],[9,12,13],[8,2,4],[0,7,4],[5,13,0],[13,12,9],[1,9,8],[0,10,3],[3,5,10],[7,12,9],[2,13,4],[12,7,5],[9,2,7],[3,2,9],[6,2,7],[3,1,9],[4,3,2],[5,3,11],[0,7,6],[1,6,13],[7,10,2],[12,4,8],[13,12,6],[7,5,11],[6,2,3],[2,7,6],[2,3,10],[2,7,10],[11,12,6],[0,13,5],[10,2,4],[13,0,11],[7,0,6],[8,9,4],[8,4,11],[7,11,2],[3,4,2],[6,1,3],[7,2,11],[8,9,4],[11,4,8],[10,3,1],[2,10,13],[1,7,11],[13,11,12],[2,6,9],[10,0,13],[7,10,4],[0,11,13],[13,10,1],[7,5,0],[7,12,10],[3,1,4],[7,1,5],[3,11,5],[7,5,0],[1,3,5],[10,5,12],[0,3,9],[7,1,11],[11,8,12],[3,6,2],[7,12,9],[7,11,12],[4,11,3],[0,11,13],[13,2,5],[1,5,8],[0,11,8],[3,5,1],[11,0,6],[3,11,2],[11,8,12],[4,1,3],[10,13,4],[13,9,6],[2,3,10],[12,7,9],[10,0,4],[10,13,2],[3,11,1],[7,2,9],[1,7,4],[13,1,4],[7,0,6],[5,3,9],[10,0,7],[0,7,10],[3,6,10],[13,0,5],[8,4,1],[3,1,10],[2,10,13],[13,0,5],[13,10,2],[12,7,9],[6,8,10],[6,1,8],[10,8,1],[13,5,0],[5,11,3],[7,6,1],[8,5,9],[2,13,11],[10,12,4],[13,4,1],[2,13,4],[11,7,0],[2,9,7],[1,7,6],[8,0,4],[8,1,9],[7,10,12],[13,9,6],[
7,6,11],[13,0,4],[1,8,4],[3,12,5],[10,3,1],[10,2,13],[2,4,8],[6,2,3],[3,0,10],[6,7,12],[8,12,5],[3,0,6],[13,12,10],[11,3,6],[9,0,13],[10,0,6],[7,5,2],[1,3,11],[7,10,2],[2,9,8],[11,13,12],[0,8,4],[8,12,11],[6,0,3],[1,13,4],[11,8,2],[12,3,6],[4,7,1],[7,6,12],[3,10,6],[0,10,7],[8,9,1],[0,10,6],[8,10,1]]
.map(pg => => 'osd'+n));
const by_osd = {};
for (let i = 0; i < prev_pgs.length; i++)
for (let j = 0; j < prev_pgs[i].length; j++)
by_osd[prev_pgs[i][j]] = by_osd[prev_pgs[i][j]] || [];
by_osd[prev_pgs[i][j]][j] = (by_osd[prev_pgs[i][j]][j] || 0) + 1;
This set of PGs was balanced by hand, by heavily tuning OSD weights in Ceph:
osd0: 4.2,
osd1: 3.5,
osd2: 3.45409,
osd3: 4.5,
osd4: 1.4,
osd5: 1.4,
osd6: 1.75,
osd7: 4.5,
osd8: 4.4,
osd9: 2.2,
osd10: 2.7,
osd11: 2,
osd12: 3.4,
osd13: 3.4,
EC+compression is a nightmare in Ceph, yeah :))
To calculate the average ratio between data chunks and parity chunks we
calculate the number of PG chunks for each chunk role for each OSD:
osd12: [ 18, 22, 17 ],
osd7: [ 35, 22, 8 ],
osd5: [ 6, 17, 27 ],
osd6: [ 13, 12, 28 ],
osd11: [ 13, 26, 20 ],
osd3: [ 30, 20, 10 ],
osd9: [ 8, 12, 26 ],
osd10: [ 15, 23, 20 ],
osd0: [ 22, 22, 14 ],
osd2: [ 22, 16, 16 ],
osd13: [ 29, 19, 13 ],
osd8: [ 20, 18, 12 ],
osd4: [ 8, 10, 28 ],
osd1: [ 17, 17, 17 ]
And now we can pick a pair of OSDs and determine the ratio by solving the following:
osd5 = 23*X + 27*Y = 3249728140
osd13 = 48*X + 13*Y = 2991675992
osd5 - 27/13*osd13 = 23*X - 27/13*48*X = -76.6923076923077*X = -2963752766.46154
X = 38644720.1243731
Y = (osd5-23*X)/27 = 87440725.0792377
Y/X = 2.26268232239284 ~= 2.26
Which means that parity chunks are compressed ~2.26 times worse than data chunks.
Fine, let's try to optimize for it.
async function run()
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, all_weights, 2, 2.26);
const orig = eff*4.26 / total_weight;
console.log('Original efficiency was: '+Math.round(orig*10000)/100+' %');
let prev = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256, parity_space: 2.26 });
let next = await LPOptimizer.optimize_change({ prev_pgs, osd_tree, pg_size: 3, max_combinations: 10000, parity_space: 2.26 });

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js');
// Scenario test for LPOptimizer on a tree of 3 hosts with one OSD each:
// create 16 PGs of size 3, shrink the PG size to 2, then remove one host.
// Change statistics are printed after every step.
async function run()
{
    const osd_tree = { a: { 1: 1 }, b: { 2: 1 }, c: { 3: 1 } };
    let res;

    console.log('16 PGs, size=3');
    res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16 });
    LPOptimizer.print_change_stats(res, false);

    console.log('\nReduce PG size to 2');
    // Previous PGs are the PGs from the first step cut down to their first 2 OSDs
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs.map(pg => pg.slice(0, 2)), osd_tree, pg_size: 2 });
    LPOptimizer.print_change_stats(res, false);

    console.log('\nRemove OSD 3');
    // OSD 3 is the only OSD of host 'c', so dropping the host removes it
    delete osd_tree['c'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
    LPOptimizer.print_change_stats(res, false);
}

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <functional> #include <functional>
typedef uint64_t inode_t;
// 16 bytes per object/stripe id // 16 bytes per object/stripe id
// stripe = (start of the parity stripe + peer role) // stripe = (start of the parity stripe + peer role)
// i.e. for example (256KB + one of 0,1,2) // i.e. for example (256KB + one of 0,1,2)
struct __attribute__((__packed__)) object_id struct __attribute__((__packed__)) object_id
{ {
inode_t inode; uint64_t inode;
uint64_t stripe; uint64_t stripe;
}; };

@ -1,7 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/epoll.h>
#include <sys/poll.h> #include <sys/poll.h>
#include <netinet/in.h> #include <netinet/in.h>
#include <netinet/tcp.h> #include <netinet/tcp.h>
@ -9,41 +7,51 @@
#include "osd.h" #include "osd.h"
osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop) const char* osd_op_names[] = {
{ "",
bs_block_size = strtoull(config["block_size"].c_str(), NULL, 10); "read",
bs_bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10); "write",
if (!bs_block_size) "sync",
bs_block_size = DEFAULT_BLOCK_SIZE; "stabilize",
if (!bs_bitmap_granularity) "rollback",
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY; "delete",
clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8; "sync_stab_all",
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
this->config = config; this->config = config;
this->bs = bs;
this->ringloop = ringloop; this->ringloop = ringloop;
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config this->bs_block_size = bs->get_block_size();
this->bs = new blockstore_t(config, ringloop); // FIXME: use bitmap granularity instead
this->bs_disk_alignment = bs->get_disk_alignment();
parse_config(config); parse_config(config);
epmgr = new epoll_manager_t(ringloop); epoll_fd = epoll_create(1);
this->tfd = epmgr->tfd; if (epoll_fd < 0)
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
this->tfd = new timerfd_manager_t([this](int fd, bool out, std::function<void(int, int)> handler) { set_fd_handler(fd, out, handler); });
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id) this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{ {
print_stats(); print_stats();
}); });
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
c_cli.tfd = this->tfd; c_cli.tfd = this->tfd;
c_cli.ringloop = this->ringloop; c_cli.ringloop = this->ringloop;
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); }; c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); }; c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
init_cluster(); init_cluster();
@ -53,20 +61,41 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
osd_t::~osd_t() osd_t::~osd_t()
{ {
if (tfd)
delete tfd;
tfd = NULL;
ringloop->unregister_consumer(&consumer); ringloop->unregister_consumer(&consumer);
delete epmgr; close(epoll_fd);
delete bs;
close(listen_fd); close(listen_fd);
} }
void osd_t::parse_config(blockstore_config_t & config) void osd_t::parse_config(blockstore_config_t & config)
{ {
if (config.find("log_level") == config.end()) int pos;
config["log_level"] = "1";
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
// Initial startup configuration // Initial startup configuration
json11::Json json_config = json11::Json(config); {
st_cli.parse_config(json_config); std::string ea = config["etcd_address"];
while (1)
pos = ea.find(',');
std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
if (addr.length() > 0)
if (addr.find('/') < 0)
addr += "/v3";
if (pos >= 0)
ea = ea.substr(pos+1);
st_cli.etcd_prefix = config["etcd_prefix"];
if (st_cli.etcd_prefix == "")
st_cli.etcd_prefix = "/microceph";
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10); etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
if (etcd_report_interval <= 0) if (etcd_report_interval <= 0)
etcd_report_interval = 30; etcd_report_interval = 30;
@ -75,8 +104,6 @@ void osd_t::parse_config(blockstore_config_t & config)
throw std::runtime_error("osd_num is required in the configuration"); throw std::runtime_error("osd_num is required in the configuration");
c_cli.osd_num = osd_num; c_cli.osd_num = osd_num;
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no"; run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
// Cluster configuration // Cluster configuration
bind_address = config["bind_address"]; bind_address = config["bind_address"];
if (bind_address == "") if (bind_address == "")
@ -100,21 +127,29 @@ void osd_t::parse_config(blockstore_config_t & config)
if (client_queue_depth < 128) if (client_queue_depth < 128)
client_queue_depth = 128; client_queue_depth = 128;
} }
if (config.find("pg_stripe_size") != config.end())
pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10); recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE) if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
recovery_sync_batch = strtoull(config["recovery_sync_batch"].c_str(), NULL, 10);
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes") if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
readonly = true; readonly = true;
print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10); print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
if (!print_stats_interval) if (!print_stats_interval)
print_stats_interval = 3; print_stats_interval = 3;
slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10); c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
if (!slow_log_interval) if (!c_cli.peer_connect_interval)
slow_log_interval = 10; c_cli.peer_connect_interval = 5;
c_cli.parse_config(json_config); c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
if (!c_cli.peer_connect_timeout)
c_cli.peer_connect_timeout = 5;
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
st_cli.log_level = log_level;
c_cli.log_level = log_level;
} }
void osd_t::bind_socket() void osd_t::bind_socket()
@ -165,10 +200,15 @@ void osd_t::bind_socket()
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events) epoll_event ev; = listen_fd; = EPOLLIN;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
{ {
c_cli.accept_connections(listen_fd); close(listen_fd);
}); close(epoll_fd);
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
} }
bool osd_t::shutdown() bool osd_t::shutdown()
@ -178,17 +218,124 @@ bool osd_t::shutdown()
{ {
return false; return false;
} }
return !bs || bs->is_safe_to_stop(); return bs->is_safe_to_stop();
} }
void osd_t::loop() void osd_t::loop()
{ {
if (!wait_state)
wait_state = 1;
handle_peers(); handle_peers();
c_cli.read_requests(); c_cli.read_requests();
c_cli.send_replies(); c_cli.send_replies();
ringloop->submit(); ringloop->submit();
} }
void osd_t::set_fd_handler(int fd, bool out, std::function<void(int, int)> handler)
if (handler != NULL)
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
epoll_event ev; = fd; = EPOLLIN | (out ? EPOLLOUT : 0) | EPOLLRDHUP;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
epoll_handlers[fd] = handler;
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
void osd_t::handle_epoll_events()
wait_state = 0;
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
wait_state = 1;
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
data->callback = [this](ring_data_t *data)
if (data->res < 0)
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
int nfds;
epoll_event events[MAX_EPOLL_EVENTS];
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
for (int i = 0; i < nfds; i++)
if (events[i].data.fd == listen_fd)
// Accept new connections
sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
assert(peer_fd != 0);
char peer_str[256];
printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
c_cli.clients[peer_fd] = {
.peer_addr = addr,
.peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTED,
.in_buf = malloc(c_cli.receive_buffer_size),
// Add FD to epoll
set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
c_cli.handle_peer_epoll(peer_fd, epoll_events);
// Try to accept next connection
peer_addr_size = sizeof(addr);
if (peer_fd == -1 && errno != EAGAIN)
throw std::runtime_error(std::string("accept: ") + strerror(errno));
auto & cb = epoll_handlers[events[i].data.fd];
cb(events[i].data.fd, events[i].events);
printf("%d events\n", nfds);
if (nfds == MAX_EPOLL_EVENTS)
goto restart;
void osd_t::exec_op(osd_op_t *cur_op) void osd_t::exec_op(osd_op_t *cur_op)
{ {
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin); clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@ -199,34 +346,21 @@ void osd_t::exec_op(osd_op_t *cur_op)
return; return;
} }
inflight_ops++; inflight_ops++;
cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC || if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX || cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
((cur_op->req.hdr.opcode == OSD_OP_SEC_READ || (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % bs_disk_alignment || cur_op->req.sec_rw.offset % bs_disk_alignment) ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) && (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
(cur_op->req.sec_rw.len > OSD_RW_MAX || (cur_op-> > OSD_RW_MAX || cur_op-> % bs_disk_alignment || cur_op-> % bs_disk_alignment))
cur_op->req.sec_rw.len % bs_bitmap_granularity ||
cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
((cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
(cur_op-> > OSD_RW_MAX ||
cur_op-> % bs_bitmap_granularity ||
cur_op-> % bs_bitmap_granularity)))
{ {
// Bad command // Bad command
finish_op(cur_op, -EINVAL); finish_op(cur_op, -EINVAL);
return; return;
} }
if (cur_op->req.hdr.opcode == OSD_OP_PING)
// Pong
finish_op(cur_op, 0);
if (readonly && if (readonly &&
cur_op->req.hdr.opcode != OSD_OP_SEC_READ && cur_op->req.hdr.opcode != OSD_OP_SECONDARY_READ &&
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST && cur_op->req.hdr.opcode != OSD_OP_SECONDARY_LIST &&
cur_op->req.hdr.opcode != OSD_OP_READ && cur_op->req.hdr.opcode != OSD_OP_READ &&
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG) cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
{ {
@ -274,9 +408,9 @@ void osd_t::reset_stats()
void osd_t::print_stats() void osd_t::print_stats()
{ {
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = 0; i <= OSD_OP_MAX; i++)
{ {
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING) if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
{ {
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]); uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval; uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
@ -297,7 +431,7 @@ void osd_t::print_stats()
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i]; prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
} }
} }
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = 0; i <= OSD_OP_MAX; i++)
{ {
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i]) if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
{ {
@ -335,73 +469,3 @@ void osd_t::print_stats()
printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects); printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
} }
} }
void osd_t::print_slow()
char alloc[1024];
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
for (auto & kv: c_cli.clients)
for (auto op: kv.second->received_ops)
if ((now.tv_sec - op->tv_begin.tv_sec) >= slow_log_interval)
int l = sizeof(alloc), n;
char *buf = alloc;
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
bufprintf("[OSD %lu] Slow op", osd_num);
if (kv.second->osd_num)
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
bufprintf(" from client %d", kv.second->peer_fd);
bufprintf(": %s id=%lu", osd_op_names[op->req.hdr.opcode], op->;
if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
bufprintf(" %lx:%lx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
if (op->req.sec_rw.version == UINT64_MAX)
bufprintf("%s", "max");
bufprintf("%lu", op->req.sec_rw.version);
if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
bufprintf(" offset=%x len=%x", op->req.sec_rw.offset, op->req.sec_rw.len);
else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
obj_ver_id *ov = (obj_ver_id*)(op->buf + i);
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
op->req.hdr.opcode == OSD_OP_DELETE)
bufprintf(" inode=%lx offset=%lx len=%x", op->, op->, op->;
#undef bufprintf
printf("%s\n", alloc);

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once #pragma once
#include <sys/types.h> #include <sys/types.h>
@ -19,9 +16,8 @@
#include "blockstore.h" #include "blockstore.h"
#include "ringloop.h" #include "ringloop.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
#include "epoll_manager.h"
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
#include "messenger.h" #include "cluster_client.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#define OSD_LOADING_PGS 0x01 #define OSD_LOADING_PGS 0x01
@ -37,10 +33,12 @@
#define DEFAULT_RECOVERY_BATCH 16 #define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default
//#define OSD_STUB //#define OSD_STUB
extern const char* osd_op_names[];
struct osd_object_id_t struct osd_object_id_t
{ {
osd_num_t osd_num; osd_num_t osd_num;
@ -51,21 +49,11 @@ struct osd_recovery_op_t
{ {
int st = 0; int st = 0;
bool degraded = false; bool degraded = false;
pg_num_t pg_num = 0;
object_id oid = { 0 }; object_id oid = { 0 };
osd_op_t *osd_op = NULL; osd_op_t *osd_op = NULL;
}; };
// Posted as /osd/inodestats/$osd, then accumulated by the monitor
// Per-inode operation counters. Each array is indexed by operation kind:
// INODE_STATS_READ, INODE_STATS_WRITE, INODE_STATS_DELETE
struct inode_stats_t
{
    // Reported under the "usec" key
    uint64_t op_sum[3] = { 0 };
    // Reported under the "count" key
    uint64_t op_count[3] = { 0 };
    // Reported under the "bytes" key
    uint64_t op_bytes[3] = { 0 };
};
class osd_t class osd_t
{ {
// config // config
@ -76,45 +64,38 @@ class osd_t
bool readonly = false; bool readonly = false;
osd_num_t osd_num = 1; // OSD numbers start with 1 osd_num_t osd_num = 1; // OSD numbers start with 1
bool run_primary = false; bool run_primary = false;
bool no_rebalance = false;
bool no_recovery = false;
std::string bind_address; std::string bind_address;
int bind_port, listen_backlog; int bind_port, listen_backlog;
// FIXME: Implement client queue depth limit // FIXME: Implement client queue depth limit
int client_queue_depth = 128; int client_queue_depth = 128;
bool allow_test_ops = true; bool allow_test_ops = true;
int print_stats_interval = 3; int print_stats_interval = 3;
int slow_log_interval = 10;
int immediate_commit = IMMEDIATE_NONE; int immediate_commit = IMMEDIATE_NONE;
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int log_level = 0; int log_level = 0;
// cluster state // cluster state
etcd_state_client_t st_cli; etcd_state_client_t st_cli;
osd_messenger_t c_cli; cluster_client_t c_cli;
int etcd_failed_attempts = 0; int etcd_failed_attempts = 0;
std::string etcd_lease_id; std::string etcd_lease_id;
json11::Json self_state; json11::Json self_state;
bool loading_peer_config = false; bool loading_peer_config = false;
std::set<pool_pg_num_t> pg_state_dirty; std::set<pg_num_t> pg_state_dirty;
bool pg_config_applied = false; bool pg_config_applied = false;
bool etcd_reporting_pg_state = false; bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false; bool etcd_reporting_stats = false;
// peers and PGs // peers and PGs
std::map<pool_id_t, pg_num_t> pg_counts; std::map<pg_num_t, pg_t> pgs;
std::map<pool_pg_num_t, pg_t> pgs; std::set<pg_num_t> dirty_pgs;
std::set<pool_pg_num_t> dirty_pgs;
std::set<osd_num_t> dirty_osds;
int copies_to_delete_after_sync_count = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0; uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
int peering_state = 0; int peering_state = 0;
unsigned pg_count = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops; std::map<object_id, osd_recovery_op_t> recovery_ops;
int recovery_done = 0;
osd_op_t *autosync_op = NULL; osd_op_t *autosync_op = NULL;
// Unstable writes // Unstable writes
@ -126,18 +107,20 @@ class osd_t
bool stopping = false; bool stopping = false;
int inflight_ops = 0; int inflight_ops = 0;
blockstore_t *bs; blockstore_t *bs;
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size; uint32_t bs_block_size, bs_disk_alignment;
uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
ring_loop_t *ringloop; ring_loop_t *ringloop;
timerfd_manager_t *tfd = NULL; timerfd_manager_t *tfd = NULL;
epoll_manager_t *epmgr = NULL;
int wait_state = 0;
int epoll_fd = 0;
int listening_port = 0; int listening_port = 0;
int listen_fd = 0; int listen_fd = 0;
ring_consumer_t consumer; ring_consumer_t consumer;
std::map<int, std::function<void(int, int)>> epoll_handlers;
// op statistics // op statistics
osd_op_stats_t prev_stats; osd_op_stats_t prev_stats;
std::map<uint64_t, inode_stats_t> inode_stats;
const char* recovery_stat_names[2] = { "degraded", "misplaced" }; const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = { 0 }; uint64_t recovery_stat_count[2][2] = { 0 };
uint64_t recovery_stat_bytes[2][2] = { 0 }; uint64_t recovery_stat_bytes[2][2] = { 0 };
@ -145,8 +128,7 @@ class osd_t
// cluster connection // cluster connection
void parse_config(blockstore_config_t & config); void parse_config(blockstore_config_t & config);
void init_cluster(); void init_cluster();
void on_change_osd_state_hook(osd_num_t peer_osd); void on_change_osd_state_hook(uint64_t osd_num);
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
void on_change_etcd_state_hook(json11::Json::object & changes); void on_change_etcd_state_hook(json11::Json::object & changes);
void on_load_config_hook(json11::Json::object & changes); void on_load_config_hook(json11::Json::object & changes);
json11::Json on_load_pgs_checks_hook(); json11::Json on_load_pgs_checks_hook();
@ -157,7 +139,6 @@ class osd_t
void create_osd_state(); void create_osd_state();
void renew_lease(); void renew_lease();
void print_stats(); void print_stats();
void print_slow();
void reset_stats(); void reset_stats();
json11::Json get_statistics(); json11::Json get_statistics();
void report_statistics(); void report_statistics();
@ -168,23 +149,24 @@ class osd_t
// event loop, socket read/write // event loop, socket read/write
void loop(); void loop();
void set_fd_handler(int fd, bool out, std::function<void(int, int)> handler);
void handle_epoll_events();
// peer handling (primary OSD logic) // peer handling (primary OSD logic)
void parse_test_peer(std::string peer); void parse_test_peer(std::string peer);
void handle_peers(); void handle_peers();
void repeer_pgs(osd_num_t osd_num); void repeer_pgs(osd_num_t osd_num);
void start_pg_peering(pg_t & pg); void start_pg_peering(pg_num_t pg_num);
void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps); void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps); void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void discard_list_subop(osd_op_t *list_op); void discard_list_subop(osd_op_t *list_op);
bool stop_pg(pg_t & pg); bool stop_pg(pg_num_t pg_num);
void reset_pg(pg_t & pg);
void finish_stop_pg(pg_t & pg); void finish_stop_pg(pg_t & pg);
// flushing, recovery and backfill // flushing, recovery and backfill
void submit_pg_flush_ops(pg_t & pg); void submit_pg_flush_ops(pg_num_t pg_num);
void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval); void handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data); void submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
bool pick_next_recovery(osd_recovery_op_t &op); bool pick_next_recovery(osd_recovery_op_t &op);
void submit_recovery_op(osd_recovery_op_t *op); void submit_recovery_op(osd_recovery_op_t *op);
bool continue_recovery(); bool continue_recovery();
@ -205,32 +187,27 @@ class osd_t
bool prepare_primary_rw(osd_op_t *cur_op); bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op); void continue_primary_read(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op); void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op); void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg); bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg); void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version);
void handle_primary_bs_subop(osd_op_t *subop); void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop); void add_bs_subop_stats(osd_op_t *subop);
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval); void pg_cancel_write_queue(pg_t & pg, object_id oid, int retval);
void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op); void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set); void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
void submit_primary_sync_subops(osd_op_t *cur_op); void submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op); void submit_primary_stab_subops(osd_op_t *cur_op);
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size) inline pg_num_t map_to_pg(object_id oid)
{ {
uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
if (!pg_count)
pg_count = 1;
return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1; return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1;
} }
public: public:
osd_t(blockstore_config_t & config, ring_loop_t *ringloop); osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
~osd_t(); ~osd_t();
void force_stop(int exitcode); void force_stop(int exitcode);
bool shutdown(); bool shutdown();

void slice()
// Slice the request into blockstore requests to individual objects
// Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
std::vector read_parts;
int block = bs->get_block_size();
uint64_t stripe1 = cur_op-> / block / 2;
uint64_t stripe2 = (cur_op-> + cur_op-> + block*2 - 1) / block / 2 - 1;
for (uint64_t s = stripe1; s <= stripe2; s++)
uint64_t start = s == stripe1 ? cur_op-> - stripe1*block*2 : 0;
uint64_t end = s == stripe2 ? cur_op-> + cur_op-> - stripe2*block*2 : block*2;
if (start < block)
.role = 1,
.oid = {
.inode = cur_op->,
.stripe = (s << STRIPE_ROLE_BITS) | 1,
.version = UINT64_MAX,
.offset = start,
.len = (block < end ? block : end) - start,
if (end > block)
.role = 2,
.oid = {
.inode = cur_op->,
.stripe = (s << STRIPE_ROLE_BITS) | 2,
.version = UINT64_MAX,
.offset = (start > block ? start-block : 0),
.len = end - (start > block ? start-block : 0),

View File

@ -1,10 +1,6 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "osd.h" #include "osd.h"
#include "base64.h" #include "base64.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#include "osd_rmw.h"
// Startup sequence: // Startup sequence:
// Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state // Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@ -18,7 +14,7 @@ void osd_t::init_cluster()
{ {
if (run_primary) if (run_primary)
{ {
// Test version of clustering code with 1 pool, 1 PG and 2 peers // Test version of clustering code with 1 PG and 2 peers
// Example: peers = 2:<address>:<port>,3:<address>:<port> // Example: peers = 2:<address>:<port>,3:<address>:<port>
std::string peerstr = config["peers"]; std::string peerstr = config["peers"];
while (peerstr.size()) while (peerstr.size())
@ -31,30 +27,15 @@ void osd_t::init_cluster()
{ {
throw std::runtime_error("run_primary requires at least 2 peers"); throw std::runtime_error("run_primary requires at least 2 peers");
} }
pgs[{ 1, 1 }] = (pg_t){ pgs[1] = (pg_t){
.state = PG_PEERING, .state = PG_PEERING,
.scheme = POOL_SCHEME_XOR,
.pg_cursize = 0, .pg_cursize = 0,
.pg_size = 3,
.pg_minsize = 2,
.pg_data_size = 2,
.pool_id = 1,
.pg_num = 1, .pg_num = 1,
.target_set = { 1, 2, 3 }, .target_set = { 1, 2, 3 },
.cur_set = { 0, 0, 0 }, .cur_set = { 0, 0, 0 },
}; };
st_cli.pool_config[1] = (pool_config_t){ report_pg_state(pgs[1]);
.exists = true, pg_count = 1;
.id = 1,
.name = "testpool",
.scheme = POOL_SCHEME_XOR,
.pg_size = 3,
.pg_minsize = 2,
.pg_count = 1,
.real_pg_count = 1,
report_pg_state(pgs[{ 1, 1 }]);
pg_counts[1] = 1;
} }
bind_socket(); bind_socket();
} }
@ -62,8 +43,7 @@ void osd_t::init_cluster()
{ {
st_cli.tfd = tfd; st_cli.tfd = tfd;
st_cli.log_level = log_level; st_cli.log_level = log_level;
st_cli.on_change_osd_state_hook = [this](osd_num_t peer_osd) { on_change_osd_state_hook(peer_osd); }; st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_pg_history_hook = [this](pool_id_t pool_id, pg_num_t pg_num) { on_change_pg_history_hook(pool_id, pg_num); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); }; st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); };
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); }; st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); }; st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
@ -103,7 +83,7 @@ void osd_t::parse_test_peer(std::string peer)
{ "addresses", json11::Json::array { addr } }, { "addresses", json11::Json::array { addr } },
{ "port", port }, { "port", port },
}; };
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); c_cli.connect_peer(peer_osd, json11::Json::array { addr }, port);
} }
json11::Json osd_t::get_osd_state() json11::Json osd_t::get_osd_state()
@ -142,7 +122,7 @@ json11::Json osd_t::get_statistics()
} }
st["host"] = self_state["host"]; st["host"] = self_state["host"];
json11::Json::object op_stats, subop_stats; json11::Json::object op_stats, subop_stats;
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = 0; i <= OSD_OP_MAX; i++)
{ {
op_stats[osd_op_names[i]] = json11::Json::object { op_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.op_stat_count[i] }, { "count", c_cli.stats.op_stat_count[i] },
@ -150,7 +130,7 @@ json11::Json osd_t::get_statistics()
{ "bytes", c_cli.stats.op_stat_bytes[i] }, { "bytes", c_cli.stats.op_stat_bytes[i] },
}; };
} }
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = 0; i <= OSD_OP_MAX; i++)
{ {
subop_stats[osd_op_names[i]] = json11::Json::object { subop_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.subop_stat_count[i] }, { "count", c_cli.stats.subop_stat_count[i] },
@ -179,47 +159,11 @@ void osd_t::report_statistics()
return; return;
} }
etcd_reporting_stats = true; etcd_reporting_stats = true;
// Report space usage statistics as a whole
// Maybe we'll report it using deltas if we tune for a lot of inodes at some point
json11::Json::object inode_space;
for (auto kv: bs->get_inode_space_stats())
inode_space[std::to_string(kv.first)] = kv.second;
json11::Json::object inode_ops;
for (auto kv: inode_stats)
inode_ops[std::to_string(kv.first)] = json11::Json::object {
{ "read", json11::Json::object {
{ "count", kv.second.op_count[INODE_STATS_READ] },
{ "usec", kv.second.op_sum[INODE_STATS_READ] },
{ "bytes", kv.second.op_bytes[INODE_STATS_READ] },
} },
{ "write", json11::Json::object {
{ "count", kv.second.op_count[INODE_STATS_WRITE] },
{ "usec", kv.second.op_sum[INODE_STATS_WRITE] },
{ "bytes", kv.second.op_bytes[INODE_STATS_WRITE] },
} },
{ "delete", json11::Json::object {
{ "count", kv.second.op_count[INODE_STATS_DELETE] },
{ "usec", kv.second.op_sum[INODE_STATS_DELETE] },
{ "bytes", kv.second.op_bytes[INODE_STATS_DELETE] },
} },
json11::Json::array txn = { json11::Json::object { json11::Json::array txn = { json11::Json::object {
{ "request_put", json11::Json::object { { "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) }, { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
{ "value", base64_encode(get_statistics().dump()) }, { "value", base64_encode(get_statistics().dump()) },
} }, } }
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/space/"+std::to_string(osd_num)) },
{ "value", base64_encode(json11::Json(inode_space).dump()) },
} },
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/inodestats/"+std::to_string(osd_num)) },
{ "value", base64_encode(json11::Json(inode_ops).dump()) },
} },
} }; } };
for (auto & p: pgs) for (auto & p: pgs)
{ {
@ -238,7 +182,7 @@ void osd_t::report_statistics()
pg_stats["write_osd_set"] = pg.cur_set; pg_stats["write_osd_set"] = pg.cur_set;
txn.push_back(json11::Json::object { txn.push_back(json11::Json::object {
{ "request_put", json11::Json::object { { "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)) }, { "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pg_num)) },
{ "value", base64_encode(json11::Json(pg_stats).dump()) }, { "value", base64_encode(json11::Json(pg_stats).dump()) },
} } } }
}); });
@ -263,46 +207,19 @@ void osd_t::report_statistics()
}); });
} }
void osd_t::on_change_osd_state_hook(osd_num_t peer_osd) void osd_t::on_change_osd_state_hook(uint64_t peer_osd)
{ {
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end()) if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
{ {
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]["addresses"], st_cli.peer_states[peer_osd]["port"].int64_value());
} }
} }
void osd_t::on_change_etcd_state_hook(json11::Json::object & changes) void osd_t::on_change_etcd_state_hook(json11::Json::object & changes)
{ {
// FIXME apply config changes in runtime (maybe, some) // FIXME apply config changes in runtime (maybe, some)
if (run_primary) apply_pg_count();
{ apply_pg_config();
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
auto pg_it = pgs.find({
.pool_id = pool_id,
.pg_num = pg_num,
if (pg_it != pgs.end() && pg_it->second.epoch > pg_it->second.reported_epoch &&
st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg_it->second.epoch)
pg_it->second.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
object_id oid = { 0 };
bool first = true;
for (auto op: pg_it->second.write_queue)
if (first || oid != op.first)
oid = op.first;
first = false;
} }
void osd_t::on_load_config_hook(json11::Json::object & global_config) void osd_t::on_load_config_hook(json11::Json::object & global_config)
@ -312,18 +229,13 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
{ {
if (this->config.find(cfg_var.first) == this->config.end()) if (this->config.find(cfg_var.first) == this->config.end())
{ {
if (cfg_var.second.is_string()) // FIXME Convert int to str
{ osd_config[cfg_var.first] = cfg_var.second.string_value();
osd_config[cfg_var.first] = cfg_var.second.string_value();
osd_config[cfg_var.first] = cfg_var.second.dump();
} }
} }
parse_config(osd_config); parse_config(osd_config);
bind_socket(); bind_socket();
acquire_lease(); acquire_lease();
} }
@ -420,7 +332,6 @@ void osd_t::create_osd_state()
{ {
st_cli.load_pgs(); st_cli.load_pgs();
} }
}); });
} }
@ -512,169 +423,149 @@ void osd_t::on_load_pgs_hook(bool success)
void osd_t::apply_pg_count() void osd_t::apply_pg_count()
{ {
for (auto & pool_item: st_cli.pool_config) pg_num_t pg_count = st_cli.pg_config.size();
if (pg_count > 0 && (st_cli.pg_config.begin()->first != 1 || std::prev(st_cli.pg_config.end())->first != pg_count))
{ {
if (pool_item.second.real_pg_count != 0 && printf("Invalid PG configuration: PG numbers don't cover the whole 1..%d range\n", pg_count);
pool_item.second.real_pg_count != pg_counts[pool_item.first]) force_stop(1);
if (this->pg_count != 0 && this->pg_count != pg_count)
// Check that all PGs are offline. It is not allowed to change PG count when any PGs are online
// The external tool must wait for all PGs to come down before changing PG count
// If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
// So an OSD just dies if it detects PG count change while there are active PGs
int still_active = 0;
for (auto & kv: pgs)
{ {
// Check that all pool PGs are offline. It is not allowed to change PG count when any PGs are online if (kv.second.state & PG_ACTIVE)
// The external tool must wait for all PGs to come down before changing PG count
// If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
// So an OSD just dies if it detects PG count change while there are active PGs
int still_active = 0;
for (auto & kv: pgs)
{ {
if (kv.first.pool_id == pool_item.first && (kv.second.state & PG_ACTIVE)) still_active++;
if (still_active > 0)
"[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
" but %u PG(s) are still active. This is not allowed. Exiting\n",
this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
} }
} }
this->pg_counts[pool_item.first] = pool_item.second.real_pg_count; if (still_active > 0)
printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
} }
this->pg_count = pg_count;
} }
void osd_t::apply_pg_config() void osd_t::apply_pg_config()
{ {
bool all_applied = true; bool all_applied = true;
for (auto & pool_item: st_cli.pool_config) for (auto & kv: st_cli.pg_config)
{ {
auto pool_id = pool_item.first; pg_num_t pg_num = kv.first;
for (auto & kv: pool_item.second.pg_config) auto & pg_cfg = kv.second;
bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num &&
!pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num);
bool currently_taken = this->pgs.find(pg_num) != this->pgs.end() &&
this->pgs[pg_num].state != PG_OFFLINE;
if (currently_taken && !take)
{ {
pg_num_t pg_num = kv.first; // Stop this PG
auto & pg_cfg = kv.second; stop_pg(pg_num);
bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num && }
!pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num); else if (take)
auto pg_it = this->pgs.find({ .pool_id = pool_id, .pg_num = pg_num }); {
bool currently_taken = pg_it != this->pgs.end() && pg_it->second.state != PG_OFFLINE; // Take this PG
if (currently_taken && !take) std::set<osd_num_t> all_peers;
for (osd_num_t pg_osd: pg_cfg.target_set)
{ {
// Stop this PG if (pg_osd != 0)
stop_pg(pg_it->second); {
} }
else if (take) for (osd_num_t pg_osd: pg_cfg.all_peers)
{ {
// Take this PG if (pg_osd != 0)
std::set<osd_num_t> all_peers; {
for (osd_num_t pg_osd: pg_cfg.target_set) all_peers.insert(pg_osd);
for (auto & hist_item: pg_cfg.target_history)
for (auto pg_osd: hist_item)
{ {
if (pg_osd != 0) if (pg_osd != 0)
{ {
all_peers.insert(pg_osd); all_peers.insert(pg_osd);
} }
} }
for (osd_num_t pg_osd: pg_cfg.all_peers) }
if (currently_taken)
if (this->pgs[pg_num].state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
{ {
if (pg_osd != 0) if (this->pgs[pg_num].target_set == pg_cfg.target_set)
{ {
all_peers.insert(pg_osd); // No change in osd_set; history changes are ignored
for (auto & hist_item: pg_cfg.target_history)
for (auto pg_osd: hist_item)
if (pg_osd != 0)
if (currently_taken)
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
if (pg_it->second.target_set == pg_cfg.target_set)
// No change in osd_set; history changes are ignored
// Stop PG, reapply change after stopping
all_applied = false;
else if (pg_it->second.state & PG_STOPPING)
// Reapply change after stopping
all_applied = false;
continue; continue;
} }
else if (pg_it->second.state & PG_STARTING)
if (pg_cfg.cur_primary == this->osd_num)
// PG locked, continue
// Reapply change after locking the PG
all_applied = false;
else else
{ {
throw std::runtime_error( // Stop PG, reapply change after stopping
"Unexpected PG "+std::to_string(pool_id)+"/"+std::to_string(pg_num)+ stop_pg(pg_num);
" state: "+std::to_string(pg_it->second.state) all_applied = false;
); continue;
} }
} }
auto & pg = this->pgs[{ .pool_id = pool_id, .pg_num = pg_num }]; else if (this->pgs[pg_num].state & PG_STOPPING)
pg = (pg_t){
.state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
.scheme = pool_item.second.scheme,
.pg_cursize = 0,
.pg_size = pool_item.second.pg_size,
.pg_minsize = pool_item.second.pg_minsize,
.pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
.pool_id = pool_id,
.pg_num = pg_num,
.reported_epoch = pg_cfg.epoch,
.target_history = pg_cfg.target_history,
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set,
if (pg.scheme == POOL_SCHEME_JERASURE)
{ {
use_jerasure(pg.pg_size, pg.pg_data_size, true); // Reapply change after stopping
all_applied = false;
} }
this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num }); else if (this->pgs[pg_num].state & PG_STARTING)
if (pg_cfg.cur_primary == this->osd_num)
{ {
// Add peers if (pg_cfg.cur_primary == this->osd_num)
for (auto pg_osd: all_peers)
{ {
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end()) // PG locked, continue
{ }
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); else
} {
// Reapply change after locking the PG
all_applied = false;
} }
} }
else else
{ {
// Reapply change after locking the PG throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(this->pgs[pg_num].state));
all_applied = false;
} }
} }
this->pgs[pg_num] = (pg_t){
.state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
.pg_cursize = 0,
.pg_num = pg_num,
.target_history = pg_cfg.target_history,
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set,
if (pg_cfg.cur_primary == this->osd_num)
// Add peers
for (auto pg_osd: all_peers)
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
// Reapply change after locking the PG
all_applied = false;
} }
} }
report_pg_states(); report_pg_states();
@ -687,7 +578,8 @@ void osd_t::report_pg_states()
{ {
return; return;
} }
std::vector<std::pair<pool_pg_num_t,bool>> reporting_pgs; etcd_reporting_pg_state = true;
std::vector<std::pair<pg_num_t,bool>> reporting_pgs;
json11::Json::array checks; json11::Json::array checks;
json11::Json::array success; json11::Json::array success;
json11::Json::array failure; json11::Json::array failure;
@ -699,23 +591,9 @@ void osd_t::report_pg_states()
continue; continue;
} }
auto & pg = pg_it->second; auto & pg = pg_it->second;
reporting_pgs.push_back({ *it, pg.history_changed }); reporting_pgs.push_back({ pg.pg_num, pg.history_changed });
std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num));
bool pg_state_exists = false; if (pg.state == PG_STARTING)
if (pg.state != PG_STARTING)
auto pool_it = st_cli.pool_config.find(pg.pool_id);
if (pool_it != st_cli.pool_config.end())
auto pg_it = pool_it->second.pg_config.find(pg.pg_num);
if (pg_it != pool_it->second.pg_config.end() &&
pg_it->second.cur_state != 0)
pg_state_exists = true;
if (!pg_state_exists)
{ {
// Check that the PG key does not exist // Check that the PG key does not exist
// Failed check indicates an unsuccessful PG lock attempt in this case // Failed check indicates an unsuccessful PG lock attempt in this case
@ -727,7 +605,9 @@ void osd_t::report_pg_states()
} }
else else
{ {
// Check that the key is ours if it already exists // Check that the key is ours
// Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
// and an unexpected race condition for started pgs (PG lock is held by someone else)
checks.push_back(json11::Json::object { checks.push_back(json11::Json::object {
{ "target", "LEASE" }, { "target", "LEASE" },
{ "lease", etcd_lease_id }, { "lease", etcd_lease_id },
@ -754,7 +634,7 @@ void osd_t::report_pg_states()
} }
success.push_back(json11::Json::object { success.push_back(json11::Json::object {
{ "request_put", json11::Json::object { { "request_put", json11::Json::object {
{ "key", state_key_base64 }, { "key", base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num)) },
{ "value", base64_encode(json11::Json(json11::Json::object { { "value", base64_encode(json11::Json(json11::Json::object {
{ "primary", this->osd_num }, { "primary", this->osd_num },
{ "state", pg_state_keywords }, { "state", pg_state_keywords },
@ -765,26 +645,26 @@ void osd_t::report_pg_states()
}); });
if (pg.history_changed) if (pg.history_changed)
{ {
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
pg.history_changed = false; pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); if (pg.state == PG_ACTIVE)
json11::Json::object history_value = { {
{ "epoch", pg.epoch }, success.push_back(json11::Json::object {
{ "all_peers", pg.all_peers }, { "request_delete_range", json11::Json::object {
{ "osd_sets", pg.target_history }, { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
}; } }
checks.push_back(json11::Json::object { });
{ "target", "MOD" }, }
{ "key", history_key }, else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{ "result", "LESS" }, {
{ "mod_revision", st_cli.etcd_watch_revision+1 }, success.push_back(json11::Json::object {
}); { "request_put", json11::Json::object {
success.push_back(json11::Json::object { { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
{ "request_put", json11::Json::object { { "value", base64_encode(json11::Json(json11::Json::object {
{ "key", history_key }, { "all_peers", pg.all_peers },
{ "value", base64_encode(json11::Json(history_value).dump()) }, }).dump()) },
} } } }
}); });
} }
} }
failure.push_back(json11::Json::object { failure.push_back(json11::Json::object {
@ -794,7 +674,6 @@ void osd_t::report_pg_states()
}); });
} }
pg_state_dirty.clear(); pg_state_dirty.clear();
etcd_reporting_pg_state = true;
st_cli.etcd_txn(json11::Json::object { st_cli.etcd_txn(json11::Json::object {
{ "compare", checks }, { "success", success }, { "failure", failure } { "compare", checks }, { "success", success }, { "failure", failure }
}, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data) }, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
@ -820,23 +699,14 @@ void osd_t::report_pg_states()
if (res["kvs"].array_items().size()) if (res["kvs"].array_items().size())
{ {
auto kv = st_cli.parse_etcd_kv(res["kvs"][0]); auto kv = st_cli.parse_etcd_kv(res["kvs"][0]);
if (kv.key.substr(st_cli.etcd_prefix.length()+10) == st_cli.etcd_prefix+"/pg/state/") pg_num_t pg_num = stoull_full(kv.key.substr(st_cli.etcd_prefix.length()+10));
auto pg_it = pgs.find(pg_num);
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
{ {
pool_id_t pool_id = 0; // Live PG state update failed
pg_num_t pg_num = 0; printf("Failed to report state of PG %u which is live. Race condition detected, exiting\n", pg_num);
char null_byte = 0; force_stop(1);
sscanf(kv.key.c_str() + st_cli.etcd_prefix.length()+10, "%u/%u%c", &pool_id, &pg_num, &null_byte); return;
if (null_byte == 0)
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
// Live PG state update failed
printf("Failed to report state of pool %u PG %u which is live. Race condition detected, exiting\n", pool_id, pg_num);
} }
} }
} }
@ -849,16 +719,13 @@ void osd_t::report_pg_states()
for (auto pp: reporting_pgs) for (auto pp: reporting_pgs)
{ {
auto pg_it = this->pgs.find(pp.first); auto pg_it = this->pgs.find(pp.first);
if (pg_it != this->pgs.end() && if (pg_it != this->pgs.end())
pg_it->second.state == PG_OFFLINE &&
pg_state_dirty.find(pp.first) == pg_state_dirty.end())
{ {
// Forget offline PGs after reporting their state if (pg_it->second.state == PG_OFFLINE)
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
{ {
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); // Remove offline PGs after reporting their state
} }
} }
} }
// Push other PG state updates, if any // Push other PG state updates, if any

View File

@ -1,12 +1,10 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "osd.h" #include "osd.h"
#define FLUSH_BATCH 512 #define FLUSH_BATCH 512
void osd_t::submit_pg_flush_ops(pg_t & pg) void osd_t::submit_pg_flush_ops(pg_num_t pg_num)
{ {
pg_t & pg = pgs[pg_num];
pg_flush_batch_t *fb = new pg_flush_batch_t(); pg_flush_batch_t *fb = new pg_flush_batch_t();
pg.flush_batch = fb; pg.flush_batch = fb;
auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin(); auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin();
@ -47,7 +45,7 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
if (l.second.size() > 0) if (l.second.size() > 0)
{ {
fb->flush_ops++; fb->flush_ops++;
submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(),; submit_flush_op(pg.pg_num, fb, true, l.first, l.second.size(),;
} }
} }
for (auto & l: fb->stable_lists) for (auto & l: fb->stable_lists)
@ -55,15 +53,14 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
if (l.second.size() > 0) if (l.second.size() > 0)
{ {
fb->flush_ops++; fb->flush_ops++;
submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(),; submit_flush_op(pg.pg_num, fb, false, l.first, l.second.size(),;
} }
} }
} }
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval) void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
{ {
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num }; if (pgs.find(pg_num) == pgs.end() || pgs[pg_num].flush_batch != fb)
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
{ {
// Throw the result away // Throw the result away
return; return;
@ -81,12 +78,9 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
} }
else else
{ {
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval)); printf("Error while doing flush on OSD %lu: %s\n", osd_num, strerror(-retval));
auto fd_it = c_cli.osd_peer_fds.find(peer_osd); assert(c_cli.osd_peer_fds.find(peer_osd) != c_cli.osd_peer_fds.end());
if (fd_it != c_cli.osd_peer_fds.end()) c_cli.stop_client(c_cli.osd_peer_fds[peer_osd]);
return; return;
} }
} }
@ -95,7 +89,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
{ {
// This flush batch is done // This flush batch is done
std::vector<osd_op_t*> continue_ops; std::vector<osd_op_t*> continue_ops;
auto & pg =; auto & pg = pgs[pg_num];
auto it = pg.flush_actions.begin(), prev_it = it; auto it = pg.flush_actions.begin(), prev_it = it;
auto erase_start = it; auto erase_start = it;
while (1) while (1)
@ -156,22 +150,22 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
} }
} }
void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data) void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
{ {
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
// Copy buffer so it gets freed along with the operation // Copy buffer so it gets freed along with the operation
op->buf = malloc_or_die(sizeof(obj_ver_id) * count); op->buf = malloc(sizeof(obj_ver_id) * count);
memcpy(op->buf, data, sizeof(obj_ver_id) * count); memcpy(op->buf, data, sizeof(obj_ver_id) * count);
if (peer_osd == this->osd_num) if (peer_osd == this->osd_num)
{ {
// local // local
clock_gettime(CLOCK_REALTIME, &op->tv_begin); clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t((blockstore_op_t){ op->bs_op = new blockstore_op_t({
.opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE), .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
.callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op) .callback = [this, op, pg_num, fb](blockstore_op_t *bs_op)
{ {
add_bs_subop_stats(op); add_bs_subop_stats(op);
handle_flush_op(bs_op->opcode == BS_OP_ROLLBACK, pool_id, pg_num, fb, this->osd_num, bs_op->retval); handle_flush_op(bs_op->opcode == BS_OP_ROLLBACK, pg_num, fb, this->osd_num, bs_op->retval);
delete op->bs_op; delete op->bs_op;
op->bs_op = NULL; op->bs_op = NULL;
delete op; delete op;
@ -186,21 +180,22 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
// Peer // Peer
int peer_fd = c_cli.osd_peer_fds[peer_osd]; int peer_fd = c_cli.osd_peer_fds[peer_osd];
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->iov.push_back(op->buf, count * sizeof(obj_ver_id)); op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->send_list.push_back(op->buf, count * sizeof(obj_ver_id));
op->peer_fd = peer_fd; op->peer_fd = peer_fd;
op->req = (osd_any_op_t){ op->req = {
.sec_stab = { .sec_stab = {
.header = { .header = {
.id = c_cli.next_subop_id++, .id = c_cli.next_subop_id++,
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE), .opcode = (uint64_t)(rollback ? OSD_OP_SECONDARY_ROLLBACK : OSD_OP_SECONDARY_STABILIZE),
}, },
.len = count * sizeof(obj_ver_id), .len = count * sizeof(obj_ver_id),
}, },
}; };
op->callback = [this, pool_id, pg_num, fb, peer_osd](osd_op_t *op) op->callback = [this, pg_num, fb, peer_osd](osd_op_t *op)
{ {
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval); handle_flush_op(op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK, pg_num, fb, peer_osd, op->reply.hdr.retval);
delete op; delete op;
}; };
c_cli.outbox_push(op); c_cli.outbox_push(op);
@ -209,38 +204,34 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
bool osd_t::pick_next_recovery(osd_recovery_op_t &op) bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
{ {
if (!no_recovery) for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{ {
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++) if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
{ {
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED)) for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
{ {
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++) if (recovery_ops.find(obj_it->first) == recovery_ops.end())
{ {
if (recovery_ops.find(obj_it->first) == recovery_ops.end()) op.degraded = true;
{ op.pg_num = pg_it->first;
op.degraded = true; op.oid = obj_it->first;
op.oid = obj_it->first; return true;
return true;
} }
} }
} }
} }
if (!no_rebalance) for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{ {
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++) if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
{ {
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED)) for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
{ {
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++) if (recovery_ops.find(obj_it->first) == recovery_ops.end())
{ {
if (recovery_ops.find(obj_it->first) == recovery_ops.end()) op.degraded = false;
{ op.pg_num = pg_it->first;
op.degraded = false; op.oid = obj_it->first;
op.oid = obj_it->first; return true;
return true;
} }
} }
} }
@ -252,7 +243,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{ {
op->osd_op = new osd_op_t(); op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT; op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = (osd_any_op_t){ op->osd_op->req = {
.rw = { .rw = {
.header = { .header = {
@ -264,44 +255,24 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.len = 0, .len = 0,
}, },
}; };
if (log_level > 2)
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
op->osd_op->callback = [this, op](osd_op_t *osd_op) op->osd_op->callback = [this, op](osd_op_t *osd_op)
{ {
// Don't sync the write, it will be synced by our regular sync coroutine
if (osd_op->reply.hdr.retval < 0) if (osd_op->reply.hdr.retval < 0)
{ {
// Error recovering object // Error recovering object
if (osd_op->reply.hdr.retval == -EPIPE) if (osd_op->reply.hdr.retval == -EPIPE)
{ {
// PG is stopped or one of the OSDs is gone, error is harmless // PG is stopped or one of the OSDs is gone, error is harmless
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
} }
else else
{ {
throw std::runtime_error("Failed to recover an object"); throw std::runtime_error("Failed to recover an object");
} }
} }
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;
recovery_ops.erase(op->oid); recovery_ops.erase(op->oid);
delete osd_op; delete osd_op;
if (immediate_commit != IMMEDIATE_ALL) op->osd_op = NULL;
if (recovery_done >= recovery_sync_batch)
// Force sync every <recovery_sync_batch> operations
// This is required not to pile up an excessive amount of delete operations
recovery_done = 0;
continue_recovery(); continue_recovery();
}; };
exec_op(op->osd_op); exec_op(op->osd_op);

osd_id.h Normal file
View File

@ -0,0 +1,4 @@
#pragma once
typedef uint64_t osd_num_t;
typedef uint32_t pg_num_t;

View File

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include "osd.h" #include "osd.h"
#include <signal.h> #include <signal.h>
@ -21,8 +18,6 @@ static void handle_sigint(int sig)
int main(int narg, char *args[]) int main(int narg, char *args[])
{ {
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
if (sizeof(osd_any_op_t) > OSD_PACKET_SIZE || if (sizeof(osd_any_op_t) > OSD_PACKET_SIZE ||
sizeof(osd_any_reply_t) > OSD_PACKET_SIZE) sizeof(osd_any_reply_t) > OSD_PACKET_SIZE)
{ {
@ -41,13 +36,16 @@ int main(int narg, char *args[])
signal(SIGINT, handle_sigint); signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint); signal(SIGTERM, handle_sigint);
ring_loop_t *ringloop = new ring_loop_t(512); ring_loop_t *ringloop = new ring_loop_t(512);
osd = new osd_t(config, ringloop); // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
blockstore_t *bs = new blockstore_t(config, ringloop);
osd = new osd_t(config, bs, ringloop);
while (1) while (1)
{ {
ringloop->loop(); ringloop->loop();
ringloop->wait(); ringloop->wait();
} }
delete osd; delete osd;
delete bs;
delete ringloop; delete ringloop;
return 0; return 0;
} }

View File

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#pragma once #pragma once
#include "object_id.h" #include "object_id.h"
@ -13,22 +10,20 @@
#define OSD_PACKET_SIZE 0x80 #define OSD_PACKET_SIZE 0x80
// Opcodes // Opcodes
#define OSD_OP_MIN 1 #define OSD_OP_MIN 1
#define OSD_OP_SEC_LIST 9 #define OSD_OP_SHOW_CONFIG 9
#define OSD_OP_SHOW_CONFIG 10 #define OSD_OP_READ 10
#define OSD_OP_READ 11 #define OSD_OP_WRITE 11
#define OSD_OP_WRITE 12 #define OSD_OP_SYNC 12
#define OSD_OP_SYNC 13 #define OSD_OP_DELETE 13
#define OSD_OP_DELETE 14 #define OSD_OP_MAX 13
#define OSD_OP_PING 15
#define OSD_OP_MAX 15
// Alignment & limit for read/write operations // Alignment & limit for read/write operations
#define MEM_ALIGNMENT 512 #define MEM_ALIGNMENT 512
@ -71,9 +66,6 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
uint32_t offset; uint32_t offset;
// length // length
uint32_t len; uint32_t len;
// bitmap/attribute length - bitmap comes after header, but before data
uint32_t attr_len;
uint32_t pad0;
}; };
struct __attribute__((__packed__)) osd_reply_secondary_rw_t struct __attribute__((__packed__)) osd_reply_secondary_rw_t
@ -81,9 +73,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_rw_t
osd_reply_header_t header; osd_reply_header_t header;
// for reads and writes: assigned or read version number // for reads and writes: assigned or read version number
uint64_t version; uint64_t version;
// for reads: bitmap/attribute length (just to double-check)
uint32_t attr_len;
uint32_t pad0;
}; };
// delete object on the secondary OSD // delete object on the secondary OSD
@ -145,10 +134,7 @@ struct __attribute__((__packed__)) osd_op_secondary_list_t
osd_op_header_t header; osd_op_header_t header;
// placement group total number and total count // placement group total number and total count
pg_num_t list_pg, pg_count; pg_num_t list_pg, pg_count;
// size of an area that maps to one PG continuously
uint64_t pg_stripe_size; uint64_t pg_stripe_size;
// inode range (used to select pools)
uint64_t min_inode, max_inode;
}; };
struct __attribute__((__packed__)) osd_reply_secondary_list_t struct __attribute__((__packed__)) osd_reply_secondary_list_t
@ -160,6 +146,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
}; };
// read or write to the primary OSD (must be within individual stripe) // read or write to the primary OSD (must be within individual stripe)
// FIXME: allow to return used block bitmap (required for snapshots)
struct __attribute__((__packed__)) osd_op_rw_t struct __attribute__((__packed__)) osd_op_rw_t
{ {
osd_op_header_t header; osd_op_header_t header;
@ -174,9 +161,6 @@ struct __attribute__((__packed__)) osd_op_rw_t
struct __attribute__((__packed__)) osd_reply_rw_t struct __attribute__((__packed__)) osd_reply_rw_t
{ {
osd_reply_header_t header; osd_reply_header_t header;
// for reads: bitmap length
uint32_t bitmap_len;
uint32_t pad0;
}; };
// sync to the primary OSD // sync to the primary OSD
@ -218,5 +202,3 @@ union osd_any_reply_t
osd_reply_sync_t sync; osd_reply_sync_t sync;
uint8_t buf[OSD_PACKET_SIZE]; uint8_t buf[OSD_PACKET_SIZE];
}; };
extern const char* osd_op_names[];

@ -1,6 +1,3 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see for details)
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <sys/epoll.h> #include <sys/epoll.h>
@ -29,7 +26,7 @@ void osd_t::handle_peers()
degraded_objects += p.second.degraded_objects.size(); degraded_objects += p.second.degraded_objects.size();
if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN)) if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
peering_state = peering_state | OSD_FLUSHING_PGS; peering_state = peering_state | OSD_FLUSHING_PGS;
else if (p.second.state & PG_ACTIVE) else
peering_state = peering_state | OSD_RECOVERING; peering_state = peering_state | OSD_RECOVERING;
} }
else else
@ -53,7 +50,7 @@ void osd_t::handle_peers()
{ {
if (!p.second.flush_batch) if (!p.second.flush_batch)
{ {
submit_pg_flush_ops(p.second); submit_pg_flush_ops(p.first);
} }
still = true; still = true;
} }
@ -91,20 +88,23 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
if (repeer) if (repeer)
{ {
// Repeer this pg // Repeer this pg
printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd); printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
start_pg_peering(p.second); start_pg_peering(p.second.pg_num);
} }
} }
} }
} }
// Reset PG state (when peering or stopping) // Repeer on each connect/disconnect peer event
void osd_t::reset_pg(pg_t & pg) void osd_t::start_pg_peering(pg_num_t pg_num)
{ {
auto & pg = pgs[pg_num];
pg.state = PG_PEERING;
this->peering_state |= OSD_PEERING_PGS;
// Reset PG state
pg.cur_peers.clear(); pg.cur_peers.clear();
pg.state_dict.clear(); pg.state_dict.clear();
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
incomplete_objects -= pg.incomplete_objects.size(); incomplete_objects -= pg.incomplete_objects.size();
misplaced_objects -= pg.misplaced_objects.size(); misplaced_objects -= pg.misplaced_objects.size();
degraded_objects -= pg.degraded_objects.size(); degraded_objects -= pg.degraded_objects.size();
@ -120,44 +120,20 @@ void osd_t::reset_pg(pg_t & pg)
pg.flush_batch = NULL; pg.flush_batch = NULL;
for (auto p: pg.write_queue) for (auto p: pg.write_queue)
{ {
cancel_primary_write(p.second); finish_op(p.second, -EPIPE);
} }
pg.write_queue.clear(); pg.write_queue.clear();
uint64_t pg_stripe_size = st_cli.pool_config[pg.pool_id].pg_stripe_size;
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); ) for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
{ {
// Forget this PG's unstable writes // Forget this PG's unstable writes
if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid, pg_stripe_size) == pg.pg_num) pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
if (n == pg.pg_num)
unstable_writes.erase(it++); unstable_writes.erase(it++);
else else
it++; it++;
} }
dirty_pgs.erase({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); pg.inflight = 0;
} dirty_pgs.erase(pg.pg_num);
// Repeer on each connect/disconnect peer event
void osd_t::start_pg_peering(pg_t & pg)
pg.state = PG_PEERING;
this->peering_state |= OSD_PEERING_PGS;
// Drop connections of clients who have this PG in dirty_pgs
if (immediate_commit != IMMEDIATE_ALL)
std::vector<int> to_stop;
for (auto & cp: c_cli.clients)
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
for (auto peer_fd: to_stop)
// Calculate current write OSD set // Calculate current write OSD set
pg.pg_cursize = 0; pg.pg_cursize = 0;
pg.cur_set.resize(pg.target_set.size()); pg.cur_set.resize(pg.target_set.size());
@ -182,25 +158,19 @@ void osd_t::start_pg_peering(pg_t & pg)
// (PG history is kept up to the latest active+clean state) // (PG history is kept up to the latest active+clean state)
for (auto & history_set: pg.target_history) for (auto & history_set: pg.target_history)
{ {
bool found = true; bool found = false;
for (auto history_osd: history_set) for (auto history_osd: history_set)
{ {
if (history_osd != 0) if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
{ {
found = false; found = true;
if (history_osd == this->osd_num || break;
c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
found = true;
} }
} }
if (!found) if (!found)
{ {
pg.state = PG_INCOMPLETE; pg.state = PG_INCOMPLETE;
report_pg_state(pg); report_pg_state(pg);
} }
} }
} }
@ -208,7 +178,6 @@ void osd_t::start_pg_peering(pg_t & pg)
{ {
pg.state = PG_INCOMPLETE; pg.state = PG_INCOMPLETE;
report_pg_state(pg); report_pg_state(pg);
} }
std::set<osd_num_t> cur_peers; std::set<osd_num_t> cur_peers;
for (auto pg_osd: pg.all_peers) for (auto pg_osd: pg.all_peers)
@ -219,7 +188,7 @@ void osd_t::start_pg_peering(pg_t & pg)
} }
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end()) else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
{ {
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
} }
} }
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end()); pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
@ -232,7 +201,8 @@ void osd_t::start_pg_peering(pg_t & pg)
{ {
// Discard the result after completion, which, chances are, will be unsuccessful // Discard the result after completion, which, chances are, will be unsuccessful
discard_list_subop(it->second); discard_list_subop(it->second);
pg.peering_state->list_ops.erase(it++); pg.peering_state->list_ops.erase(it);
it = pg.peering_state->list_ops.begin();
} }
else else
it++; it++;
@ -245,7 +215,8 @@ void osd_t::start_pg_peering(pg_t & pg)
{ {
free(it->second.buf); free(it->second.buf);
} }
pg.peering_state->list_results.erase(it++); pg.peering_state->list_results.erase(it);
it = pg.peering_state->list_results.begin();
} }
else else
it++; it++;
@ -263,7 +234,6 @@ void osd_t::start_pg_peering(pg_t & pg)
if (!pg.peering_state) if (!pg.peering_state)
{ {
pg.peering_state = new pg_peering_state_t(); pg.peering_state = new pg_peering_state_t();
pg.peering_state->pool_id = pg.pool_id;
pg.peering_state->pg_num = pg.pg_num; pg.peering_state->pg_num = pg.pg_num;
} }
for (osd_num_t peer_osd: cur_peers) for (osd_num_t peer_osd: cur_peers)
@ -318,13 +288,14 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
auto & cl =[role_osd]); auto & cl =[role_osd]);
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd; op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->req = (osd_any_op_t){ op->peer_fd = cl.peer_fd;
op->req = {
.sec_sync = { .sec_sync = {
.header = { .header = {
.id = c_cli.next_subop_id++, .id = c_cli.next_subop_id++,
}, },
}, },
}; };
@ -359,10 +330,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
clock_gettime(CLOCK_REALTIME, &op->tv_begin); clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t(); op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST; op->bs_op->opcode = BS_OP_LIST;
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size; op->bs_op->oid.stripe = pg_stripe_size;
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); op->bs_op->len = pg_count;
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->len = pg_counts[ps->pool_id];
op->bs_op->offset = ps->pg_num-1; op->bs_op->offset = ps->pg_num-1;
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op) op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
{ {
@ -372,8 +341,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
} }
add_bs_subop_stats(op); add_bs_subop_stats(op);
printf( printf(
"[PG %u/%u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n", "[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
ps->pool_id, ps->pg_num, role_osd, bs_op->retval, bs_op->version ps->pg_num, role_osd, bs_op->retval, bs_op->version
); );
ps->list_results[role_osd] = { ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->bs_op->buf, .buf = (obj_ver_id*)op->bs_op->buf,
@ -393,19 +362,18 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Peer // Peer
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = c_cli.osd_peer_fds[role_osd]; op->peer_fd = c_cli.osd_peer_fds[role_osd];
op->req = (osd_any_op_t){ op->req = {
.sec_list = { .sec_list = {
.header = { .header = {
.id = c_cli.next_subop_id++, .id = c_cli.next_subop_id++,
}, },
.list_pg = ps->pg_num, .list_pg = ps->pg_num,
.pg_count = pg_counts[ps->pool_id], .pg_count = pg_count,
.pg_stripe_size = st_cli.pool_config[ps->pool_id].pg_stripe_size, .pg_stripe_size = pg_stripe_size,
.min_inode = ((uint64_t)(ps->pool_id) << (64 - POOL_ID_BITS)),
.max_inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1,
}, },
}; };
op->callback = [this, ps, role_osd](osd_op_t *op) op->callback = [this, ps, role_osd](osd_op_t *op)
@ -419,8 +387,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
return; return;
} }
printf( printf(
"[PG %u/%u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n", "[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
ps->pool_id, ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
); );
ps->list_results[role_osd] = { ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->buf, .buf = (obj_ver_id*)op->buf,
@ -461,16 +429,22 @@ void osd_t::discard_list_subop(osd_op_t *list_op)
} }
} }
bool osd_t::stop_pg(pg_t & pg) bool osd_t::stop_pg(pg_num_t pg_num)
{ {
auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end())
return false;
auto & pg = pg_it->second;
if (pg.peering_state) if (pg.peering_state)
{ {
// Stop peering // Stop peering
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++) for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
{ {
discard_list_subop(it->second); discard_list_subop(it->second);
} }
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++) for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
{ {
if (it->second.buf) if (it->second.buf)
{ {
@ -480,19 +454,12 @@ bool osd_t::stop_pg(pg_t & pg)
delete pg.peering_state; delete pg.peering_state;
pg.peering_state = NULL; pg.peering_state = NULL;
} }
if (pg.state & (PG_STOPPING | PG_OFFLINE)) if (!(pg.state & PG_ACTIVE))
{ {
return false; return false;
} }
if (!(pg.state & PG_ACTIVE))
return true;
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING; pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
if (pg.inflight == 0 && !pg.flush_batch && if (pg.inflight == 0 && !pg.flush_batch)
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
@ -506,14 +473,13 @@ bool osd_t::stop_pg(pg_t & pg)
void osd_t::finish_stop_pg(pg_t & pg) void osd_t::finish_stop_pg(pg_t & pg)
{ {
pg.state = PG_OFFLINE; pg.state = PG_OFFLINE;
report_pg_state(pg); report_pg_state(pg);
} }
void osd_t::report_pg_state(pg_t & pg) void osd_t::report_pg_state(pg_t & pg)
{ {
pg.print_state(); pg.print_state();
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); this->pg_state_dirty.insert(pg.pg_num);
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size())) if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
{ {
// Clear history of active+clean PGs // Clear history of active+clean PGs

@ -1,7 +1,3 @@
// License: VNPL-1.1 (see for details)
#include <unordered_map>
#include <unordered_map>
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
struct obj_ver_role struct obj_ver_role
@ -37,7 +33,6 @@ struct obj_piece_ver_t
struct pg_obj_state_check_t struct pg_obj_state_check_t
{ {
pg_t *pg; pg_t *pg;
bool replicated = false;
std::vector<obj_ver_role> list; std::vector<obj_ver_role> list;
int list_pos; int list_pos;
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
@ -46,7 +41,7 @@ struct pg_obj_state_check_t
uint64_t last_ver = 0; uint64_t last_ver = 0;
uint64_t target_ver = 0; uint64_t target_ver = 0;
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
uint64_t n_unstable = 0, n_invalid = 0; uint64_t n_unstable = 0, n_buggy = 0;
pg_osd_set_t osd_set; pg_osd_set_t osd_set;
int log_level; int log_level;
@ -78,12 +73,6 @@ void pg_obj_state_check_t::walk()
{ {
finish_object(); finish_object();
} }
if (pg->state & PG_HAS_INVALID)
// Stop PGs with "invalid" objects
if (pg->pg_cursize < pg->pg_size) if (pg->pg_cursize < pg->pg_size)
{ {
pg->state |= PG_DEGRADED; pg->state |= PG_DEGRADED;
@ -103,12 +92,12 @@ void pg_obj_state_check_t::start_object()
target_ver = 0; target_ver = 0;
ver_start = list_pos; ver_start = list_pos;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
n_unstable = n_invalid = 0; n_unstable = n_buggy = 0;
} }
void pg_obj_state_check_t::handle_version() void pg_obj_state_check_t::handle_version()
{ {
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size)) if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
@ -122,11 +111,11 @@ void pg_obj_state_check_t::handle_version()
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
last_ver = list[list_pos].version; last_ver = list[list_pos].version;
} }
unsigned replica = (list[list_pos].oid.stripe & STRIPE_MASK); int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
n_copies++; n_copies++;
if (replicated && replica > 0 || replica >= pg->pg_size) if (replica >= pg->pg_size)
{ {
n_invalid++; n_buggy++;
} }
else else
{ {
@ -134,32 +123,14 @@ void pg_obj_state_check_t::handle_version()
{ {
n_stable++; n_stable++;
} }
if (replicated) if (pg->cur_set[replica] != list[list_pos].osd_num)
{ {
int i; n_mismatched++;
for (i = 0; i < pg->cur_set.size(); i++)
if (pg->cur_set[i] == list[list_pos].osd_num)
if (i == pg->cur_set.size())
} }
else if (!(has_roles & (1 << replica)))
{ {
if (pg->cur_set[replica] != list[list_pos].osd_num) has_roles = has_roles | (1 << replica);
{ n_roles++;
if (!(has_roles & (1 << replica)))
has_roles = has_roles | (1 << replica);
} }
} }
} }
@ -171,7 +142,7 @@ void pg_obj_state_check_t::handle_version()
void pg_obj_state_check_t::finish_object() void pg_obj_state_check_t::finish_object()
{ {
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_data_size)) if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
@ -180,14 +151,11 @@ void pg_obj_state_check_t::finish_object()
obj_end = list_pos; obj_end = list_pos;
// Remember the decision // Remember the decision
uint64_t state = 0; uint64_t state = 0;
if (n_invalid > 0) if (n_buggy > 0)
{ {
// It's not allowed to change the replication scheme for a pool other than by recreating it state = OBJ_BUGGY;
// So we must bring the PG offline // FIXME: bring pg offline
state = OBJ_INCOMPLETE; throw std::runtime_error("buggy object state");
pg->state |= PG_HAS_INVALID;
} }
if (n_unstable > 0) if (n_unstable > 0)
{ {
@ -233,42 +201,51 @@ void pg_obj_state_check_t::finish_object()
{ {
return; return;
} }
if (!replicated && n_roles < pg->pg_data_size) if (n_roles < pg->pg_minsize)
{ {
if (log_level > 1) if (log_level > 1)
{ {
printf("Object is incomplete: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
if (log_level > 2)
for (int i = obj_start; i < obj_end; i++)
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
} }
pg->state = pg->state | PG_HAS_INCOMPLETE; pg->state = pg->state | PG_HAS_INCOMPLETE;
} }
else if ((replicated ? n_copies : n_roles) < pg->pg_cursize) else if (n_roles < pg->pg_cursize)
{ {
if (log_level > 1) if (log_level > 1)
{ {
printf("Object is degraded: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
if (log_level > 2)
for (int i = obj_start; i < obj_end; i++)
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
} }
pg->state = pg->state | PG_HAS_DEGRADED; pg->state = pg->state | PG_HAS_DEGRADED;
} }
else if (n_mismatched > 0) if (n_mismatched > 0)
{ {
if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize))
printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
pg->state = pg->state | PG_HAS_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED;
} }
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
log_level > 2 && (state & OBJ_MISPLACED))
for (int i = obj_start; i < obj_end; i++)
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num,
(list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
pg->total_count++; pg->total_count++;
if (state != 0 || ver_end < obj_end) if (state != 0 || ver_end < obj_end)
{ {
@ -302,11 +279,8 @@ void pg_obj_state_check_t::finish_object()
.osd_num = list[i].osd_num, .osd_num = list[i].osd_num,
.outdated = true, .outdated = true,
}); });
{ pg->state = pg->state | PG_HAS_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
} }
} }
} }
@ -324,34 +298,16 @@ void pg_obj_state_check_t::finish_object()
if (it == pg->state_dict.end()) if (it == pg->state_dict.end())
{ {
std::vector<uint64_t> read_target; std::vector<uint64_t> read_target;
if (replicated) read_target.resize(pg->pg_size);
for (int i = 0; i < pg->pg_size; i++)
{ {
for (auto & o: osd_set) read_target[i] = 0;
if (!o.outdated)
while (read_target.size() < pg->pg_size)
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
} }
else for (auto & o: osd_set)
{ {
read_target.resize(pg->pg_size); if (!o.outdated)
for (int i = 0; i < pg->pg_size; i++)
{ {
read_target[i] = 0; read_target[o.role] = o.osd_num;
for (auto & o: osd_set)
if (!o.outdated)
read_target[o.role] = o.osd_num;
} }
} }
pg->state_dict[osd_set] = { pg->state_dict[osd_set] = {
@ -388,9 +344,7 @@ void pg_t::calc_object_states(int log_level)
pg_obj_state_check_t st; pg_obj_state_check_t st;
st.log_level = log_level; st.log_level = log_level; = this; = this;
st.replicated = (this->scheme == POOL_SCHEME_REPLICATED);
auto ps = peering_state; auto ps = peering_state;
epoch = 0;
for (auto it: ps->list_results) for (auto it: ps->list_results)
{ {
auto nstab = it.second.stable_count; auto nstab = it.second.stable_count;
@ -401,10 +355,6 @@ void pg_t::calc_object_states(int log_level)
obj_ver_id *ov = it.second.buf; obj_ver_id *ov = it.second.buf;
for (uint64_t i = 0; i < n; i++, ov++) for (uint64_t i = 0; i < n; i++, ov++)
{ {
if ((ov->version >> (64-PG_EPOCH_BITS)) > epoch)
epoch = (ov->version >> (64-PG_EPOCH_BITS));
st.list[start+i] = { st.list[start+i] = {
.oid = ov->oid, .oid = ov->oid,
.version = ov->version, .version = ov->version,
@ -420,17 +370,12 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end()); std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states // Walk over it and check object states
st.walk(); st.walk();
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
assert(epoch != ((1ul << PG_EPOCH_BITS)-1));
} }
void pg_t::print_state() void pg_t::print_state()
{ {
printf( printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, "[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
(state & PG_STARTING) ? "starting" : "", (state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "", (state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "", (state & PG_PEERING) ? "peering" : "",
@ -442,8 +387,6 @@ void pg_t::print_state()
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "", (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "", (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "", (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
total_count total_count
); );
} }

View File

@ -1,7 +1,5 @@
// License: VNPL-1.1 (see for details)
// License: VNPL-1.1 (see for details)
#include <map> #include <map>
#include <unordered_map>
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
@ -11,8 +9,6 @@
#include "osd_ops.h" #include "osd_ops.h"
#include "pg_states.h" #include "pg_states.h"
#define PG_EPOCH_BITS 48
struct pg_obj_loc_t struct pg_obj_loc_t
{ {
uint64_t role; uint64_t role;
@ -44,9 +40,8 @@ struct osd_op_t;
struct pg_peering_state_t struct pg_peering_state_t
{ {
// osd_num -> list result // osd_num -> list result
std::map<osd_num_t, osd_op_t*> list_ops; std::unordered_map<osd_num_t, osd_op_t*> list_ops;
std::map<osd_num_t, pg_list_result_t> list_results; std::unordered_map<osd_num_t, pg_list_result_t> list_results;
pool_id_t pool_id = 0;
pg_num_t pg_num = 0; pg_num_t pg_num = 0;
}; };
@ -56,13 +51,6 @@ struct obj_piece_id_t
uint64_t osd_num; uint64_t osd_num;
}; };
struct obj_ver_osd_t
uint64_t osd_num;
object_id oid;
uint64_t version;
struct flush_action_t struct flush_action_t
{ {
bool rollback = false, make_stable = false; bool rollback = false, make_stable = false;
@ -81,13 +69,9 @@ struct pg_flush_batch_t
struct pg_t struct pg_t
{ {
int state = 0; int state = 0;
uint64_t scheme = 0; uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, pg_data_size = 0; pg_num_t pg_num;
pool_id_t pool_id = 0;
pg_num_t pg_num = 0;
uint64_t clean_count = 0, total_count = 0; uint64_t clean_count = 0, total_count = 0;
// epoch number - should increase with each non-clean activation of the PG
uint64_t epoch = 0, reported_epoch = 0;
// target history and all potential peers // target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history; std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers; std::vector<osd_num_t> all_peers;
@ -101,14 +85,13 @@ struct pg_t
std::vector<osd_num_t> cur_set; std::vector<osd_num_t> cur_set;
// same thing in state_dict-like format // same thing in state_dict-like format
pg_osd_set_t cur_loc_set; pg_osd_set_t cur_loc_set;
// moved object map. by default, each object is considered to reside on cur_set. // moved object map. by default, each object is considered to reside on the cur_set.
// this map stores all objects that differ. // this map stores all objects that differ.
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
// which is up to ~192 MB per 1 TB in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict; std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects; btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
std::map<obj_piece_id_t, flush_action_t> flush_actions; std::map<obj_piece_id_t, flush_action_t> flush_actions;
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
btree::btree_map<object_id, uint64_t> ver_override; btree::btree_map<object_id, uint64_t> ver_override;
pg_peering_state_t *peering_state = NULL; pg_peering_state_t *peering_state = NULL;
pg_flush_batch_t *flush_batch = NULL; pg_flush_batch_t *flush_batch = NULL;

@ -1,9 +1,5 @@
// License: VNPL-1.1 (see for details)
// License: VNPL-1.1 (see for details)
#include "malloc_or_die.h"
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
#define STRIPE_SHIFT 12 #define STRIPE_SHIFT 12
@ -32,7 +28,7 @@ int main(int argc, char *argv[])
for (uint64_t osd_num = 1; osd_num <= 3; osd_num++) for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
{ {
pg_list_result_t r = { pg_list_result_t r = {
.buf = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * 1024*1024*8), .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
.total_count = 1024*1024*8, .total_count = 1024*1024*8,
.stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)), .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
}; };

View File

@ -0,0 +1,671 @@
#include "osd_primary.h"
// read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
// nuance: take care to read the same version from paired stripes!
// to do so, we remember "last readable" version until a write request completes
// and we postpone other write requests to the same stripe until completion of previous ones
// sync: sync peers, get unstable versions, stabilize them
// Validate a primary read/write/delete request and attach per-operation state to it.
//
// Maps the request to a PG number, checks that the PG is known and has PG_ACTIVE set,
// derives the object id, validates the request range and alignment, and allocates
// an osd_primary_op_data_t followed in the same allocation by pg_size osd_rmw_stripe_t
// entries, stored in cur_op->op_data.
//
// Returns true when cur_op->op_data has been set up. The visible failure paths call
// finish_op() with -EPIPE (PG missing or inactive) or -EINVAL (range/alignment check)
// and return false.
//
// NOTE(review): brace-only lines and parts of some expressions are missing from this
// copy of the file, so block nesting is not visible here - confirm against the real source.
bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
// PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size)
// But we must not use K in the process of calculating the PG number
// So we calculate the PG number using a separate setting which should be per-inode (FIXME)
pg_num_t pg_num = (cur_op-> + cur_op-> / pg_stripe_size) % pg_count + 1;
auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
// This OSD is not primary for this PG or the PG is inactive
finish_op(cur_op, -EPIPE);
return false;
// One "PG block" spans pg_minsize blockstore blocks
uint64_t pg_block_size = bs_block_size * pg_it->second.pg_minsize;
object_id oid = {
.inode = cur_op->,
// oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
.stripe = (cur_op-> / pg_stripe_size) * pg_stripe_size +
((cur_op-> % pg_stripe_size) / pg_block_size) * pg_block_size
// Reject requests that extend past oid.stripe + pg_block_size or fail the bs_disk_alignment checks
if ((cur_op-> + cur_op-> > (oid.stripe + pg_block_size) ||
(cur_op-> % bs_disk_alignment) != 0 ||
(cur_op-> % bs_disk_alignment) != 0)
finish_op(cur_op, -EINVAL);
return false;
// Single zero-filled allocation: the op data header plus pg_size stripe descriptors
// NOTE(review): the calloc() result is not checked for NULL in the visible lines
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
op_data->pg_num = pg_num;
op_data->oid = oid;
// The stripe array starts right after the header inside the same allocation
op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
cur_op->op_data = op_data;
split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op-> - oid.stripe), cur_op->, op_data->stripes);
return true;
// Find the OSD set recorded for a specific object of a PG.
//
// Looks the object up in pg.incomplete_objects, then pg.degraded_objects, then
// pg.misplaced_objects. On the first hit *object_state is set to the stored state
// pointer and a member of that state is returned (the member name is missing from
// this copy of the file). When nothing matches, *object_state is set to NULL and
// `def` is returned.
//
// NOTE(review): the first `*object_state = NULL; return def;` pair is presumably guarded
// by a condition whose line is missing here - confirm against the real source.
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
*object_state = NULL;
return def;
auto st_it = pg.incomplete_objects.find(oid);
if (st_it != pg.incomplete_objects.end())
*object_state = st_it->second;
return st_it->second->;
st_it = pg.degraded_objects.find(oid);
if (st_it != pg.degraded_objects.end())
*object_state = st_it->second;
return st_it->second->;
st_it = pg.misplaced_objects.find(oid);
if (st_it != pg.misplaced_objects.end())
*object_state = st_it->second;
return st_it->second->;
// Not present in any of the three maps: fall back to the caller-supplied default
*object_state = NULL;
return def;
// Primary read state machine.
//
// On first entry it prepares op_data through prepare_primary_rw(), sets the read range
// of the first pg_minsize stripes to the requested range, picks the target version and
// submits SUBMIT_READ subops. op_data->st records the resume point and is dispatched
// with goto on re-entry.
//
// NOTE(review): the resume labels, `return;`/`else` lines and braces are missing from
// this copy of the file, so the exact branching is not visible - confirm against the real source.
void osd_t::continue_primary_read(osd_op_t *cur_op)
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
auto & pg = pgs[op_data->pg_num];
for (int role = 0; role < pg.pg_minsize; role++)
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
// Determine version
// UINT64_MAX is used as the target version when no override is recorded for the object
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
if (pg.state == PG_ACTIVE)
// Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_minsize, 0);
submit_primary_subops(SUBMIT_READ, pg.pg_minsize,, cur_op);
cur_op->send_list.push_back(cur_op->buf, cur_op->;
op_data->st = 1;
// PG may be degraded or have misplaced objects
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid,, &op_data->object_state);
// A negative result from extend_missing_stripes() is answered with -EIO
if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
finish_op(cur_op, -EIO);
// Submit reads
// pg_minsize / pg_size are copied into op_data and read back from it in the reconstruction step below
op_data->pg_minsize = pg.pg_minsize;
op_data->pg_size = pg.pg_size;
op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
submit_primary_subops(SUBMIT_READ, pg.pg_size, cur_set, cur_op);
op_data->st = 1;
// Any subop error fails the whole read: -EPIPE if op_data->epipe is set, otherwise -EIO
if (op_data->errors > 0)
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
if (op_data->degraded)
// Reconstruct missing stripes
// FIXME: Always EC(k+1) by now. Add different coding schemes
osd_rmw_stripe_t *stripes = op_data->stripes;
for (int role = 0; role < op_data->pg_minsize; role++)
if (stripes[role].read_end != 0 && stripes[role].missing)
reconstruct_stripe(stripes, op_data->pg_size, role);
if (stripes[role].req_end != 0)
// Send buffer in parts to avoid copying
stripes[role].read_buf + (stripes[role].req_start - stripes[role].read_start),
stripes[role].req_end - stripes[role].req_start
finish_op(cur_op, cur_op->;
// Enqueue a write-like operation in pg.write_queue and tell the caller whether it may
// proceed right now.
//
// Every visible path adds (oid, cur_op) to pg.write_queue. false is returned when a flush
// action exists for the same object (same inode, same stripe with the STRIPE_MASK bits
// cleared) or when another operation for the same oid is already queued; otherwise true.
//
// NOTE(review): braces and possibly some statements are missing from this copy of the file.
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
osd_primary_op_data_t *op_data = cur_op->op_data;
// Check if actions are pending for this object
// lower_bound() with osd_num = 0 positions on the first flush action at or after this oid
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid,
.osd_num = 0,
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
// Check if there are other write requests to the same object
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
// st = 1 is the resume state this operation is left in while it waits in the queue
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
pg.write_queue.emplace(op_data->oid, cur_op);
return true;
// Primary write state machine (read-modify-write).
//
// Visible steps, in order: prepare op_data, pass the write queue check, compute the RMW
// plan with calc_rmw(), read the required blocks (SUBMIT_RMW_READ), record the version
// override, compute parity with calc_rmw_parity(), send the writes (SUBMIT_WRITE),
// update recovery statistics and object state when the object had a non-default state,
// run finalize_primary_write() with base state 6, finish the op, and then look up the
// next queued operation for the same object.
// op_data->st holds the resume point (0..8) and is dispatched with goto on re-entry.
//
// NOTE(review): resume labels, `return;` lines, braces and some statements are missing
// from this copy of the file; the branching described above is inferred from the
// surviving lines only - confirm against the real source.
void osd_t::continue_primary_write(osd_op_t *cur_op)
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
osd_primary_op_data_t *op_data = cur_op->op_data;
auto & pg = pgs[op_data->pg_num];
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
else if (op_data->st == 8) goto resume_8;
assert(op_data->st == 0);
if (!check_write_queue(cur_op, pg))
// Determine blocks to read and write
// Missing chunks are allowed to be overwritten even in incomplete objects
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for the lower performance impact
op_data->prev_set = get_object_osd_set(pg, op_data->oid,, &op_data->object_state);
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, pg.pg_minsize, pg.pg_cursize,, bs_block_size);
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
op_data->st = 2;
// A failed read cancels the queued writes for this object with -EPIPE or -EIO
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Recover missing stripes, calculate parity
calc_rmw_parity(op_data->stripes, pg.pg_size, op_data->prev_set,, bs_block_size);
// Send writes
submit_primary_subops(SUBMIT_WRITE, pg.pg_size,, cur_op);
op_data->st = 4;
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
if (op_data->fact_ver == 1)
// Object is created
if (op_data->object_state)
// recovery_type is 0 when the object state has OBJ_DEGRADED or OBJ_INCOMPLETE set, 1 otherwise
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
if (!recovery_stat_count[0][recovery_type])
recovery_stat_bytes[0][recovery_type] = 0;
for (int role = 0; role < pg.pg_size; role++)
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
if (op_data->object_state->state & OBJ_MISPLACED)
// Remove extra chunks
submit_primary_del_subops(cur_op,, op_data->object_state->osd_set);
if (op_data->n_subops > 0)
op_data->st = 8;
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
// Clear object state
remove_object_from_state(op_data->oid, op_data->object_state, pg);
// Remove version override
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6))
// oid is copied to a local before finish_op() and used for the queue lookup afterwards
// (finish_op() ends with `delete cur_op` in this file)
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->;
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
osd_op_t *next_op = next_it->second;
// Shared tail of the primary write and delete state machines.
//
// Parameters:
//   cur_op     - the primary operation being finalized
//   pg         - the PG the operation belongs to
//   loc_set    - OSD/role pairs that hold chunks of the object
//   base_state - value of op_data->st that maps to the first resume point of this
//                helper; base_state+1 maps to the second one
//
// With immediate_commit == IMMEDIATE_ALL one obj_ver_id per chunk of loc_set is collected
// into op_data->unstable_writes (the submission lines are not visible here), and both
// temporary containers are freed afterwards. The other visible path remembers the written
// version as unstable, keyed by (osd_num, oid) per chunk.
// Returns false when the caller must stop (waiting for a callback, or after an error that
// cancelled the write queue); returns true when the caller may continue.
//
// NOTE(review): labels, braces and several statements are missing from this copy of the
// file - confirm the branching against the real source.
bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state)
goto resume_6;
else if (op_data->st == base_state+1)
goto resume_7;
if (immediate_commit == IMMEDIATE_ALL)
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[loc_set.size()];
int last_start = 0;
for (auto & chunk: loc_set)
op_data->unstable_writes[last_start] = (obj_ver_id){
.oid = {
.inode = op_data->oid.inode,
// The chunk role is OR-ed into the stripe field
.stripe = op_data->oid.stripe | chunk.role,
.version = op_data->fact_ver,
.osd_num = chunk.osd_num,
.start = last_start,
.len = 1,
// NOTE(review): st is set to the literal 6 here, not to base_state; both visible callers pass 6
op_data->st = 6;
return false;
// FIXME: Free those in the destructor?
// unstable_write_osds is a single object (delete), unstable_writes is an array (delete[])
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return false;
// Remember version as unstable
for (auto & chunk: loc_set)
.osd_num = chunk.osd_num,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
}] = op_data->fact_ver;
// Remember PG as dirty to drop the connection when PG goes offline
// (this is required because of the "lazy sync")
return true;
// Primary SYNC state machine: save and clear unstable_writes -> SYNC all -> STABLE all
//
// On first entry op_data is allocated with calloc(). The OSD-wide unstable_writes map is
// copied into a flat array (op_data->unstable_writes); for every OSD the range of its
// entries inside that array is recorded as {osd_num, start, len} in
// op_data->unstable_write_osds. The set of dirty PGs is copied into op_data->dirty_pgs.
// If a step fails, the saved entries are merged back into this->unstable_writes
// (keeping the larger version) for PGs that still have PG_ACTIVE set.
// op_data->st holds the resume point (0..6) and is dispatched with goto on re-entry.
//
// NOTE(review): resume labels, `return;` lines, braces and several statements are missing
// from this copy of the file; only the surviving lines are reproduced - confirm the
// branching against the real source.
void osd_t::continue_primary_sync(osd_op_t *cur_op)
if (!cur_op->op_data)
cur_op->op_data = (osd_primary_op_data_t*)calloc(sizeof(osd_primary_op_data_t), 1);
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
assert(op_data->st == 0);
if (syncs_in_progress.size() > 0)
// Wait for previous syncs, if any
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
op_data->st = 1;
if (unstable_writes.size() == 0)
// Nothing to sync
goto finish;
// Save and clear unstable_writes
// In theory it is possible to do it on a per-client basis, but this seems to be an unnecessary complication
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
// dirty_pgs is an array allocation, so it must be released with delete[] below
op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
op_data->dirty_pg_count = dirty_pgs.size();
osd_num_t last_osd = 0;
int last_start = 0, last_end = 0;
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
// A change of osd_num closes the previous OSD's range [last_start, last_end)
if (last_osd != it->first.osd_num)
if (last_osd != 0)
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
last_osd = it->first.osd_num;
last_start = last_end;
op_data->unstable_writes[last_end] = (obj_ver_id){
.oid = it->first.oid,
.version = it->second,
// Close the range of the last OSD
if (last_osd != 0)
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
int dpg = 0;
for (auto dirty_pg_num: dirty_pgs)
op_data->dirty_pgs[dpg++] = dirty_pg_num;
if (immediate_commit != IMMEDIATE_ALL)
op_data->st = 3;
if (op_data->errors > 0)
goto resume_6;
// Stabilize version sets
op_data->st = 5;
if (op_data->errors > 0)
// Return objects back into the unstable write set
for (auto unstable_osd: *(op_data->unstable_write_osds))
for (int i = 0; i < unstable_osd.len; i++)
// Except those from peered PGs
// Entries of this OSD live at [start, start+len) of the flat array, so the index
// must be offset by start; a bare `i` would re-read the first OSD's entries for every OSD
auto & w = op_data->unstable_writes[unstable_osd.start + i];
pg_num_t wpg = map_to_pg(w.oid);
if (pgs[wpg].state & PG_ACTIVE)
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
.osd_num = unstable_osd.osd_num,
.oid = w.oid,
// Keep the larger of the stored version and the version being returned
dest = dest < w.version ? w.version : dest;
for (int i = 0; i < op_data->dirty_pg_count; i++)
auto & pg =>dirty_pgs[i]);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
// FIXME: Free those in the destructor?
// dirty_pgs and unstable_writes were allocated with new[]; unstable_write_osds with new
delete[] op_data->dirty_pgs;
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->dirty_pgs = NULL;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
if (cur_op->peer_fd)
auto it = c_cli.clients.find(cur_op->peer_fd);
if (it != c_cli.clients.end())
finish_op(cur_op, 0);
assert(syncs_in_progress.front() == cur_op);
// Continue with the next sync waiting in the queue, if there is one
if (syncs_in_progress.size() > 0)
cur_op = syncs_in_progress.front();
op_data = cur_op->op_data;
goto resume_2;
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
//
// Depending on which of OBJ_INCOMPLETE / OBJ_DEGRADED / OBJ_MISPLACED is set in
// object_state->state (checked in that order), the matching PG_HAS_* flag is cleared
// from pg.state once the corresponding per-PG object map is empty. A state with none of
// these bits raises std::runtime_error.
//
// NOTE(review): the lines that erase `oid` from the maps, decrement object_count and
// handle object_count reaching zero are missing from this copy of the file - confirm
// against the real source.
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
if (object_state->state & OBJ_INCOMPLETE)
// Successful write means that object is not incomplete anymore
if (!pg.incomplete_objects.size())
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
else if (object_state->state & OBJ_DEGRADED)
if (!pg.degraded_objects.size())
pg.state = pg.state & ~PG_HAS_DEGRADED;
else if (object_state->state & OBJ_MISPLACED)
if (!pg.misplaced_objects.size())
pg.state = pg.state & ~PG_HAS_MISPLACED;
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
if (!object_state->object_count)
// Primary delete state machine.
//
// Visible steps, in order: prepare op_data, refuse with -EBUSY when the PG has
// PG_DEGRADED or PG_LEFT_ON_DEAD set, pass the write queue check, read once to learn the
// actual version (SUBMIT_RMW_READ), record the version override, submit the delete
// subops, run finalize_primary_write() with base state 6, adjust the PG object state,
// finish the op, and then look up the next queued operation for the same object.
// op_data->st holds the resume point (0..7) and is dispatched with goto on re-entry.
//
// NOTE(review): resume labels, `return;`/`else` lines, braces and some statements are
// missing from this copy of the file; in particular the branch bodies around the
// `if (!op_data->object_state)` check near the end are incomplete - confirm against the
// real source.
void osd_t::continue_primary_del(osd_op_t *cur_op)
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
osd_primary_op_data_t *op_data = cur_op->op_data;
auto & pg = pgs[op_data->pg_num];
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
finish_op(cur_op, -EBUSY);
if (!check_write_queue(cur_op, pg))
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid,, &op_data->object_state);
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
op_data->st = 2;
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Submit deletes
// The object's own OSD set is used when it has a recorded state, otherwise the PG's current set
submit_primary_del_subops(cur_op, NULL, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
op_data->st = 4;
if (op_data->errors > 0)
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
// Remove version override
if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6))
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
remove_object_from_state(op_data->oid, op_data->object_state, pg);
// oid is copied to a local before finish_op() and used for the queue lookup afterwards
// (finish_op() ends with `delete cur_op` in this file)
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->;
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
osd_op_t *next_op = next_it->second;

View File

@ -1,6 +1,3 @@
// License: VNPL-1.1 (see for details)
// License: VNPL-1.1 (see for details)
#pragma once #pragma once
#include "osd.h" #include "osd.h"
@ -23,9 +20,8 @@ struct osd_primary_op_data_t
object_id oid; object_id oid;
uint64_t target_ver; uint64_t target_ver;
uint64_t fact_ver = 0; uint64_t fact_ver = 0;
uint64_t scheme = 0;
int n_subops = 0, done = 0, errors = 0, epipe = 0; int n_subops = 0, done = 0, errors = 0, epipe = 0;
int degraded = 0, pg_size, pg_data_size; int degraded = 0, pg_size, pg_minsize;
osd_rmw_stripe_t *stripes; osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL; osd_op_t *subops = NULL;
uint64_t *prev_set = NULL; uint64_t *prev_set = NULL;
@ -33,13 +29,7 @@ struct osd_primary_op_data_t
// for sync. oops, requires freeing // for sync. oops, requires freeing
std::vector<unstable_osd_num_t> *unstable_write_osds = NULL; std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
pool_pg_num_t *dirty_pgs = NULL; pg_num_t *dirty_pgs = NULL;
int dirty_pg_count = 0; int dirty_pg_count = 0;
osd_num_t *dirty_osds = NULL;
int dirty_osd_count = 0;
obj_ver_id *unstable_writes = NULL; obj_ver_id *unstable_writes = NULL;
obj_ver_osd_t *copies_to_delete = NULL;
int copies_to_delete_count = 0;
}; };
bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num);

View File

@ -0,0 +1,489 @@
#include "osd_primary.h"
// Start an internal SYNC operation when none is pending.
//
// Does nothing when immediate_commit == IMMEDIATE_ALL or when autosync_op is already set.
// Otherwise it builds an OSD_OP_IN operation carrying an OSD_OP_SYNC request. The
// callback prints a warning when the reply's retval is negative, then deletes the
// operation and resets autosync_op to NULL.
//
// NOTE(review): the line that actually submits autosync_op is missing from this copy of
// the file - confirm against the real source.
void osd_t::autosync()
// FIXME Autosync based on the number of unstable writes to prevent
// "journal_sector_buffer_count is too low for this batch" errors
if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->req = {
.sync = {
.header = {
.id = 1,
.opcode = OSD_OP_SYNC,
autosync_op->callback = [this](osd_op_t *op)
if (op->reply.hdr.retval < 0)
// retval is negated so that strerror() receives a positive errno value
printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
// Clearing autosync_op lets the next autosync() call start a new sync
delete autosync_op;
autosync_op = NULL;
// Complete an operation with the given return value.
//
// For operations bound to a PG (op_data set and pg_num > 0) the PG's inflight counter is
// asserted to be non-negative, and a PG that has PG_STOPPING set with no inflight
// operations and no flush batch is handled specially. Operations without a peer_fd take
// the local branch; for the others the client is looked up by peer_fd and the reply
// header (magic, opcode, retval) is filled in. The visible last statement deletes cur_op.
//
// NOTE(review): the inflight decrement, the callback invocation, the reply submission and
// all braces/else lines are missing from this copy of the file, so which statements
// belong to which branch is not visible - confirm against the real source.
void osd_t::finish_op(osd_op_t *cur_op, int retval)
if (cur_op->op_data && cur_op->op_data->pg_num > 0)
auto & pg = pgs[cur_op->op_data->pg_num];
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
if (!cur_op->peer_fd)
// Copy lambda to be unaffected by `delete op`
// FIXME add separate magic number
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
if (cl_it != c_cli.clients.end())
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op-> = cur_op->;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
delete cur_op;
// Create and submit the per-role sub-operations of the primary operation cur_op.
//   submit_type - mode; SUBMIT_WRITE and SUBMIT_RMW_READ are the values tested here
//   pg_size     - number of roles in the PG
//   osd_set     - OSD number serving each role, 0 = no OSD for that role
// The role served by this OSD itself becomes a local blockstore_op_t, other roles
// become OSD_OP_OUT operations queued for the peer.
// NOTE(review): braces, else-branches and counter increments are not visible in
// this extract, so the nesting of the statements below must be confirmed.
void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
bool w = submit_type == SUBMIT_WRITE;
osd_primary_op_data_t *op_data = cur_op->op_data;
osd_rmw_stripe_t *stripes = op_data->stripes;
// Allocate subops
int n_subops = 0, zero_read = -1;
for (int role = 0; role < pg_size; role++)
// Candidate role for a zero-length read: this OSD's own role, or the first available one
if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
zero_read = role;
// A role takes part if its OSD exists and we are writing or it has a non-empty read range
if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
// An RMW read with nothing to read still submits exactly one subop
if (!n_subops && submit_type == SUBMIT_RMW_READ)
n_subops = 1;
zero_read = -1;
// Writes create version fact_ver+1, RMW reads use UINT64_MAX, other reads use target_ver
uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
osd_op_t *subops = new osd_op_t[n_subops];
// Reset the completion state that handle_primary_subop() updates
op_data->fact_ver = 0;
op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops;
op_data->subops = subops;
int i = 0;
for (int role = 0; role < pg_size; role++)
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(w || stripes[role].read_end != 0 || zero_read == role))
osd_num_t role_osd_num = osd_set[role];
if (role_osd_num != 0)
// Local role: go to the blockstore directly
if (role_osd_num == this->osd_num)
// tv_begin is read by add_bs_subop_stats() for latency accounting
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
// op_type is reused to carry the parent op pointer (read back in handle_primary_bs_subop)
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
.oid = {
.inode = op_data->oid.inode,
// The role number is OR-ed into the stripe field of the object id
.stripe = op_data->oid.stripe | role,
.version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
// Remote role: build an outgoing secondary read/write request
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
// NOTE(review): the right-hand side of this assignment is lost in this extract
subops[i].peer_fd =;
subops[i].req.sec_rw = {
.header = {
.id = c_cli.next_subop_id++,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
.version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
// Non-empty writes send the payload after the request packet
if (w && stripes[role].write_end > 0)
subops[i].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
subops[i].callback = [cur_op, this](osd_op_t *subop)
// A secondary write whose retval differs from the requested length marks the peer as failed
int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
// so it doesn't get freed
subop->buf = NULL;
subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval,
subop->req.sec_rw.len, subop->reply.sec_rw.version
if (fail_fd >= 0)
// write operation failed, drop the connection
// Lookup table indexed by a blockstore opcode (bs_op->opcode) that yields the OSD
// opcode passed to handle_primary_subop() and used as the statistics index.
// NOTE(review): the initializer entries are not visible in this extract.
static uint64_t bs_op_to_osd_op[] = {
// Completion handler for a local blockstore sub-operation.
// Translates the blockstore result into the arguments of handle_primary_subop().
void osd_t::handle_primary_bs_subop(osd_op_t *subop)
// For local subops op_type holds the parent operation pointer
osd_op_t *cur_op = (osd_op_t*)subop->op_type;
blockstore_op_t *bs_op = subop->bs_op;
// Reads and writes are expected to return their length, all other opcodes 0
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE ? bs_op->len : 0;
// Any unexpected result of a non-read local operation is treated as fatal
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
// die
throw std::runtime_error(
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
" retval = "+std::to_string(bs_op->retval)+")"
// Copy the needed fields out before bs_op is deleted
uint64_t opcode = bs_op_to_osd_op[bs_op->opcode];
int retval = bs_op->retval;
uint64_t version = bs_op->version;
delete bs_op;
subop->bs_op = NULL;
handle_primary_subop(opcode, cur_op, retval, expected, version);
// Account a finished local blockstore sub-operation in c_cli.stats:
// operation count, total latency and total bytes per mapped OSD opcode.
void osd_t::add_bs_subop_stats(osd_op_t *subop)
// Include local blockstore ops in statistics
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
// A zero counter restarts the statistics for this opcode
// NOTE(review): the branch for a non-zero counter is not visible in this extract
if (!c_cli.stats.op_stat_count[opcode])
c_cli.stats.op_stat_count[opcode] = 1;
c_cli.stats.op_stat_sum[opcode] = 0;
c_cli.stats.op_stat_bytes[opcode] = 0;
// Elapsed time since tv_begin, in microseconds
c_cli.stats.op_stat_sum[opcode] += (
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len;
// Record the result of one sub-operation of cur_op and, once all of them are
// accounted for, free the subop array and dispatch on the primary opcode.
//   opcode   - OSD opcode of the finished subop (used for messages)
//   retval   - actual result, compared with `expected`
//   version  - object version reported by the subop
// NOTE(review): the done/errors updates and the dispatched calls are not visible
// in this extract.
void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version)
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected)
printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
// -EPIPE is singled out among the failures (its handling is not visible here)
if (retval == -EPIPE)
// All subops of one operation must report the same version
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
throw std::runtime_error(
"different fact_versions returned from "+std::string(osd_op_names[opcode])+
" subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
op_data->fact_ver = version;
// Every subop has either failed or completed
if ((op_data->errors + op_data->done) >= op_data->n_subops)
delete[] op_data->subops;
op_data->subops = NULL;
// Continue the primary operation according to its own opcode
if (cur_op->req.hdr.opcode == OSD_OP_READ)
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
throw std::runtime_error("BUG: unknown opcode");
// Submit delete sub-operations for object chunks listed in loc_set that are not
// at their position in cur_set (all of them when cur_set is NULL).
// The deletes use version op_data->fact_ver.
// NOTE(review): braces, increments and early returns are missing from this extract.
void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
osd_primary_op_data_t *op_data = cur_op->op_data;
// First pass: count the chunks to delete
int extra_chunks = 0;
for (auto & chunk: loc_set)
if (!cur_set || chunk.osd_num != cur_set[chunk.role])
op_data->n_subops = extra_chunks;
op_data->done = op_data->errors = 0;
// Nothing to delete (the statement guarded by this test is not visible here)
if (!extra_chunks)
osd_op_t *subops = new osd_op_t[extra_chunks];
op_data->subops = subops;
int i = 0;
// Second pass: build one subop per chunk selected by the same condition
for (auto & chunk: loc_set)
if (!cur_set || chunk.osd_num != cur_set[chunk.role])
// Chunk stored on this OSD: delete through the local blockstore
if (chunk.osd_num == this->osd_num)
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
// op_type carries the parent op pointer for handle_primary_bs_subop()
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_DELETE,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
// Same version as write
.version = op_data->fact_ver,
// Chunk stored on another OSD: send a secondary delete request
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
// NOTE(review): the right-hand side of this assignment is lost in this extract
subops[i].peer_fd =;
subops[i].req.sec_del = {
.header = {
.id = c_cli.next_subop_id++,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
// Same version as write
.version = op_data->fact_ver,
subops[i].callback = [cur_op, this](osd_op_t *subop)
// Any non-zero retval marks the peer connection as failed
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_DELETE, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0)
// delete operation failed, drop the connection
// Submit one sync sub-operation per entry of op_data->unstable_write_osds:
// a local BS_OP_SYNC for this OSD, a secondary sync request for any other OSD.
// NOTE(review): braces and else-branches are missing from this extract.
void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
// Local OSD: sync through the blockstore
if (sync_osd == this->osd_num)
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
// op_type carries the parent op pointer for handle_primary_bs_subop()
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_SYNC,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
// Remote OSD: send a secondary sync request
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
// NOTE(review): the right-hand side of this assignment is lost in this extract
subops[i].peer_fd =;
subops[i].req.sec_sync = {
.header = {
.id = c_cli.next_subop_id++,
subops[i].callback = [cur_op, this](osd_op_t *subop)
// Any non-zero retval marks the peer connection as failed
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0)
// sync operation failed, drop the connection
// Submit one stabilize sub-operation per entry of op_data->unstable_write_osds.
// Each entry selects the slice [start, start+len) of op_data->unstable_writes
// (an array of obj_ver_id) to be stabilized on that OSD.
// NOTE(review): braces and else-branches are missing from this extract.
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
auto & stab_osd = (*(op_data->unstable_write_osds))[i];
// Local OSD: stabilize through the blockstore
if (stab_osd.osd_num == this->osd_num)
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
// op_type carries the parent op pointer for handle_primary_bs_subop()
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_STABLE,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
// Here len is the number of obj_ver_id entries, not bytes
.len = (uint32_t)stab_osd.len,
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
// Remote OSD: send a secondary stabilize request followed by the entry list
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
// NOTE(review): the right-hand side of this assignment is lost in this extract
subops[i].peer_fd =;
subops[i].req.sec_stab = {
.header = {
.id = c_cli.next_subop_id++,
// On the wire len is the payload size in bytes
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)
// Any non-zero retval marks the peer connection as failed
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0)
// stabilize operation failed, drop the connection
// Finish every operation queued in pg.write_queue for object oid with result
// retval, then remove the whole range of entries from the queue.
// NOTE(review): the iterator increment inside the loop is not visible in this extract.
void osd_t::pg_cancel_write_queue(pg_t & pg, object_id oid, int retval)
// st_it remembers the first entry for oid, it walks to the end of the range
auto st_it = pg.write_queue.find(oid), it = st_it;
while (it != pg.write_queue.end() && it->first == oid)
finish_op(it->second, retval);
// Erase only if at least one entry was visited
if (st_it != it)
pg.write_queue.erase(st_it, it);

View File

@ -0,0 +1,277 @@
#include "cluster_client.h"
// Queue a recvmsg on the io_uring ring for every client in read_ready_clients.
// Completions are delivered to handle_read().
// NOTE(review): braces, else-branches and returns are missing from this extract.
void cluster_client_t::read_requests()
for (int i = 0; i < read_ready_clients.size(); i++)
int peer_fd = read_ready_clients[i];
auto & cl = clients[peer_fd];
// NOTE(review): this timestamped trace looks like debugging output - confirm it is intended
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
io_uring_sqe* sqe = ringloop->get_sqe();
// No submission slot available: drop the clients already handled from the list
if (!sqe)
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
ring_data_t* data = ((ring_data_t*)sqe->user_data);
// With no operation in progress, or less than a full buffer left to read,
// receive into the client's in_buf so that several packets can arrive at once
if (!cl.read_op || cl.read_remaining < receive_buffer_size)
cl.read_iov.iov_base = cl.in_buf;
cl.read_iov.iov_len = receive_buffer_size;
// Presumably the alternative branch: receive straight into the operation's buffer
cl.read_iov.iov_base = cl.read_buf;
cl.read_iov.iov_len = cl.read_remaining;
cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1;
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
// Completion handler of the recvmsg queued by read_requests().
// data->res is the number of bytes received or a negative errno.
// Data received into cl.in_buf is split into operation headers/payloads;
// data received directly into an operation buffer only advances its position.
// NOTE(review): braces, else-branches and several calls are missing from this extract.
void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
// The client may have disappeared while the read was in flight
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
auto & cl = cl_it->second;
if (data->res < 0 && data->res != -EAGAIN)
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
// EAGAIN, or in_buf was not filled completely
if (data->res == -EAGAIN || cl.read_iov.iov_base == cl.in_buf && data->res < receive_buffer_size)
if (cl.read_ready > 0)
if (data->res == -EAGAIN)
if (data->res > 0)
if (cl.read_iov.iov_base == cl.in_buf)
// Compose operation(s) from the buffer
int remain = data->res;
void *curbuf = cl.in_buf;
while (remain > 0)
// Start a new incoming operation: its header (OSD_PACKET_SIZE bytes) is read first
if (!cl.read_op)
cl.read_op = new osd_op_t;
cl.read_op->peer_fd = peer_fd;
cl.read_op->op_type = OSD_OP_IN;
cl.read_buf = cl.read_op->req.buf;
cl.read_remaining = OSD_PACKET_SIZE;
cl.read_state = CL_READ_HDR;
// The buffered data does not complete the current part: copy all of it
if (cl.read_remaining > remain)
memcpy(cl.read_buf, curbuf, remain);
cl.read_remaining -= remain;
cl.read_buf += remain;
remain = 0;
if (cl.read_remaining <= 0)
// Presumably the alternative branch: copy exactly the rest of the current part
memcpy(cl.read_buf, curbuf, cl.read_remaining);
curbuf += cl.read_remaining;
remain -= cl.read_remaining;
cl.read_remaining = 0;
cl.read_buf = NULL;
// Long data
cl.read_remaining -= data->res;
cl.read_buf += data->res;
if (cl.read_remaining <= 0)
// Called when the part of a message selected by cl.read_state has been received
// completely; advances the per-client read state machine.
// NOTE(review): braces and the calls made in each state are missing from this extract.
void cluster_client_t::handle_finished_read(osd_client_t & cl)
// A header arrived: the magic number tells a reply from a request
if (cl.read_state == CL_READ_HDR)
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
else if (cl.read_state == CL_READ_DATA)
// Operation is ready
cl.read_op = NULL;
cl.read_state = 0;
else if (cl.read_state == CL_READ_REPLY_DATA)
// Reply is ready
// NOTE(review): req_it is dereferenced without a visible check against sent_ops.end() - confirm
auto req_it = cl.sent_ops.find(cl.read_reply_id);
osd_op_t *request = req_it->second;
cl.read_reply_id = 0;
delete cl.read_op;
cl.read_op = NULL;
cl.read_state = 0;
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
// A zero counter restarts the latency sum for this opcode
if (!stats.subop_stat_count[request->req.hdr.opcode])
stats.subop_stat_sum[request->req.hdr.opcode] = 0;
// Elapsed time since tv_begin, in microseconds
stats.subop_stat_sum[request->req.hdr.opcode] += (
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
// Process a fully received request header: allocate the data buffer the opcode
// needs and decide how many payload bytes must still be read from the client.
// NOTE(review): the lengths come from the peer and no upper bound or allocation
// check is visible here - confirm they are validated elsewhere.
// NOTE(review): braces, else-branches and some member names are missing from this extract.
void cluster_client_t::handle_op_hdr(osd_client_t *cl)
osd_op_t *cur_op = cl->read_op;
// Secondary read: buffer for the result, no request payload follows
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = 0;
// Secondary write: the payload of sec_rw.len bytes follows the header
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = cur_op->req.sec_rw.len;
// Stabilize/rollback: the payload of sec_stab.len bytes follows the header
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
if (cur_op->req.sec_stab.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
cl->read_remaining = cur_op->req.sec_stab.len;
// Primary read: no request payload follows
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
if (cur_op-> > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->;
cl->read_remaining = 0;
// Primary write: the payload follows the header
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
if (cur_op-> > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->;
cl->read_remaining = cur_op->;
if (cl->read_remaining > 0)
// Read data
cl->read_buf = cur_op->buf;
cl->read_state = CL_READ_DATA;
// Operation is ready
cl->read_op = NULL;
cl->read_state = 0;
// Process a fully received reply header: match it with the request stored in
// cl->sent_ops, copy the header into that request's reply and, for opcodes that
// return data, switch to CL_READ_REPLY_DATA to receive the payload.
// NOTE(review): braces, else-branches and some member names are missing from this extract.
void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->;
if (req_it == cl->sent_ops.end())
// Command out of sync. Drop connection
printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->;
osd_op_t *op = req_it->second;
// The header was received into the temporary read_op; move it to the real request
memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->reply.hdr.retval > 0)
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->;
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
// List reply: retval is the number of obj_ver_id entries that follow
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
op->reply.hdr.retval > 0)
op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->;
cl->read_buf = op->buf;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
// Config reply: retval is the payload size in bytes
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG &&
op->reply.hdr.retval > 0)
op->buf = malloc(op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->;
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
// No payload expected: the reply is complete, release the temporary read_op
delete cl->read_op;
cl->read_state = 0;
cl->read_op = NULL;
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
// A zero counter restarts the latency sum for this opcode
if (!stats.subop_stat_count[op->req.hdr.opcode])
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
// Elapsed time since tv_begin, in microseconds
stats.subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
// Copy lambda to be unaffected by `delete op`

osd_rmw.cpp Normal file
View File

@ -0,0 +1,450 @@
#include <malloc.h>
#include <string.h>
#include <assert.h>
#include "xor.h"
#include "osd_rmw.h"
// Grow the read range of `stripe` so that it covers [start, end).
// read_end == 0 means "nothing to read yet", in which case the range is simply set.
// NOTE(review): the braces/else separating the two cases are not visible in this extract.
static inline void extend_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & stripe)
if (stripe.read_end == 0)
stripe.read_start = start;
stripe.read_end = end;
// Otherwise widen the existing range on whichever side is too short
if (stripe.read_end < end)
stripe.read_end = end;
if (stripe.read_start > start)
stripe.read_start = start;
// Extend the read range of `stripe` to cover [start, end), excluding the part of
// that range already provided by the request itself (req_start..req_end).
// Only an overlap at the beginning or at the end of [start, end) is cut off.
// NOTE(review): braces, else and an early return are not visible in this extract.
static inline void cover_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & stripe)
// Subtract <to> write request from <from> request
// The requested range lies entirely inside the written range
if (start >= stripe.req_start &&
end <= stripe.req_end)
// The written range overlaps the tail of [start, end): cut the tail
if (start <= stripe.req_start &&
end >= stripe.req_start &&
end <= stripe.req_end)
end = stripe.req_start;
// The written range overlaps the head of [start, end): cut the head
else if (start >= stripe.req_start &&
start <= stripe.req_end &&
end >= stripe.req_end)
start = stripe.req_end;
// Merge the remaining range into the stripe's read range, as extend_read() does
if (stripe.read_end == 0)
stripe.read_start = start;
stripe.read_end = end;
if (stripe.read_end < end)
stripe.read_end = end;
if (stripe.read_start > start)
stripe.read_start = start;
// Split a request over the first pg_minsize stripes, each bs_block_size bytes long,
// storing the per-stripe sub-range in req_start/req_end.
// The 4th parameter is passed as a length (the header names it `len`) and is
// converted to an end offset below.
void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t end, osd_rmw_stripe_t *stripes)
if (end == 0)
// Zero length request - offset doesn't matter
end = start+end;
for (int role = 0; role < pg_minsize; role++)
// The request intersects the block of this role
if (start < (1+role)*bs_block_size && end > role*bs_block_size)
// Clamp both ends to the block and make them relative to its beginning
stripes[role].req_start = start < role*bs_block_size ? 0 : start-role*bs_block_size;
stripes[role].req_end = end > (role+1)*bs_block_size ? bs_block_size : end-role*bs_block_size;
// Rebuild the read range of stripe `role` from the read buffers of all other
// stripes of the PG (the buffers are combined in pairs, see the note below).
// The asserts require every other stripe's read range to start at or before the
// range being reconstructed.
// NOTE(review): the combining calls themselves are not visible in this extract
// (only their argument lists remain); presumably memxor() as in
// xor_multiple_buffers() - confirm against the full file.
void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role)
// prev: -2 = no source stripe seen yet, >= 0 = index of a pending first source,
// -1 = the destination buffer already holds a partial result
int prev = -2;
for (int other = 0; other < pg_size; other++)
if (other != role)
if (prev == -2)
prev = other;
else if (prev >= 0)
assert(stripes[role].read_start >= stripes[prev].read_start &&
stripes[role].read_start >= stripes[other].read_start);
// Arguments: the two source buffers positioned at the start of the destination range
stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
prev = -1;
assert(stripes[role].read_start >= stripes[other].read_start);
// Arguments: the next source buffer and the destination range
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
// For every data stripe (role < minsize) that must be read but has no OSD
// (osd_set[role] == 0), mark it missing and extend the read ranges of available
// stripes so that it can be reconstructed.
// Returns 0 on success, -1 when fewer than `minsize` stripes are available.
// NOTE(review): the `exist` increment and the loop exit are not visible in this extract.
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size)
for (int role = 0; role < minsize; role++)
if (stripes[role].read_end != 0 && osd_set[role] == 0)
stripes[role].missing = true;
// Stripe is missing. Extend read to other stripes.
// We need at least pg_minsize stripes to recover the lost part.
// FIXME: LRC EC and similar don't require to read all other stripes.
int exist = 0;
for (int j = 0; j < size; j++)
if (osd_set[j] != 0)
extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]);
if (exist >= minsize)
if (exist < minsize)
// Less than minsize stripes are available for this object
return -1;
return 0;
// Allocate one MEM_ALIGNMENT-aligned buffer holding `add_size` extra bytes
// followed by the read ranges of all stripes, and point each stripe's read_buf
// at its part. Stripes with read_end == 0 get no space.
// Returns the buffer (the extra area is at its beginning).
// NOTE(review): the result of memalign() is not checked for NULL here.
void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size)
// Calculate buffer size
uint64_t buf_size = add_size;
for (int role = 0; role < read_pg_size; role++)
if (stripes[role].read_end != 0)
buf_size += stripes[role].read_end - stripes[role].read_start;
// Allocate buffer
void *buf = memalign(MEM_ALIGNMENT, buf_size);
// Read areas start right after the extra area, in role order
uint64_t buf_pos = add_size;
for (int role = 0; role < read_pg_size; role++)
if (stripes[role].read_end != 0)
stripes[role].read_buf = buf + buf_pos;
buf_pos += stripes[role].read_end - stripes[role].read_start;
return buf;
// Plan a read-modify-write: from the per-stripe request ranges (req_start/req_end)
// compute what must be read and written for every stripe, allocate the RMW buffer
// and position the read/write buffers.
//   request_buf   - data supplied with the request; write buffers of modified stripes point into it
//   read_osd_set  - OSDs currently holding each role (0 = missing)
//   write_osd_set - OSDs the object is written to; differs from read_osd_set when
//                   the object is being moved/recovered
//   pg_cursize    - number of available stripes in read_osd_set (recomputed when the sets differ)
// Returns the buffer allocated by alloc_read_buffer(); new parity goes to its beginning.
// NOTE(review): braces, increments and some statements are missing from this extract.
void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
// Generic parity modification (read-modify-write) algorithm
// Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
// Now we always read continuous ranges. This means that an update of the beginning
// of one data stripe and the end of another will lead to a read of full paired stripes.
// FIXME: (Maybe) read small individual ranges in that case instead.
// [start, end) becomes the union of the request ranges over all data stripes
uint32_t start = 0, end = 0;
for (int role = 0; role < pg_minsize; role++)
if (stripes[role].req_end != 0)
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
end = std::max(stripes[role].req_end, end);
// Data stripes are written exactly where the request touches them
stripes[role].write_start = stripes[role].req_start;
stripes[role].write_end = stripes[role].req_end;
// Parity stripes that have an OSD are rewritten over the whole [start, end) range
int write_parity = 0;
for (int role = pg_minsize; role < pg_size; role++)
if (write_osd_set[role] != 0)
write_parity = 1;
stripes[role].write_start = start;
stripes[role].write_end = end;
// Parity needs the old data of [start, end) except the parts the request overwrites
if (write_parity)
for (int role = 0; role < pg_minsize; role++)
cover_read(start, end, stripes[role]);
if (write_osd_set != read_osd_set)
pg_cursize = 0;
// Object is degraded/misplaced and will be moved to <write_osd_set>
for (int role = 0; role < pg_size; role++)
if (write_osd_set[role] != read_osd_set[role])
// FIXME: For EC more than 2+1: handle case when write_osd_set == 0 and read_osd_set != 0
// We need to get data for any moved / recovered chunk
// And we need a continuous write buffer so we'll only optimize
// for the case when the whole chunk is overwritten in the request
if (stripes[role].req_start != 0 ||
stripes[role].req_end != chunk_size)
stripes[role].read_start = 0;
stripes[role].read_end = chunk_size;
// Warning: We don't modify write_start/write_end here, we do it in calc_rmw_parity()
if (read_osd_set[role] != 0)
if (pg_cursize < pg_size)
// Some stripe(s) are missing, so we need to read parity
for (int role = 0; role < pg_size; role++)
if (read_osd_set[role] == 0)
stripes[role].missing = true;
if (stripes[role].read_end != 0)
int found = 0;
for (int r2 = 0; r2 < pg_size && found < pg_minsize; r2++)
// Read the non-covered range of <role> from at least <minsize> other stripes to reconstruct it
if (read_osd_set[r2] != 0)
extend_read(stripes[role].read_start, stripes[role].read_end, stripes[r2]);
if (found < pg_minsize)
// FIXME Object is incomplete - refuse partial overwrite
// Allocate read buffers
// Extra space at the beginning: (end - start) bytes per parity stripe when parity is written
void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
// Position write buffers
uint64_t buf_pos = 0, in_pos = 0;
for (int role = 0; role < pg_size; role++)
// Modified data stripes are written straight from the request buffer
if (stripes[role].req_end != 0)
stripes[role].write_buf = request_buf + in_pos;
in_pos += stripes[role].req_end - stripes[role].req_start;
// Parity stripes are written from the extra area of the RMW buffer
else if (role >= pg_minsize && write_osd_set[role] != 0 && end != 0)
stripes[role].write_buf = rmw_buf + buf_pos;
buf_pos += end - start;
return rmw_buf;
// Describe the content of `stripe` within [wr_start, wr_end) as a sequence of
// buffer fragments appended to bufs[nbufs...]: "new" data comes from write_buf
// (request range), "old" data from read_buf (read range). Where both exist the
// new data takes precedence.
// NOTE(review): braces and else-branches are missing from this extract, so the
// nesting of the cases below must be confirmed against the full file.
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
// ns..ne: new (written) range, os..oe: old (read) range, both clipped to [wr_start, wr_end)
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
if (stripe.req_end > wr_start &&
stripe.req_start < wr_end)
ns = std::max(stripe.req_start, wr_start);
ne = std::min(stripe.req_end, wr_end);
if (stripe.read_end > wr_start &&
stripe.read_start < wr_end)
os = std::max(stripe.read_start, wr_start);
oe = std::min(stripe.read_end, wr_end);
if (ne && (!oe || ns <= os))
// NEW or NEW->OLD
bufs[nbufs++] = { .buf = stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
// Old data starts only after the end of the new data
if (os < ne)
os = ne;
if (oe > os)
bufs[nbufs++] = { .buf = stripe.read_buf + os - stripe.read_start, .len = oe-os };
else if (oe)
if (ne)
// Presumably OLD->NEW[->OLD]: old head, new data, then an optional old tail
bufs[nbufs++] = { .buf = stripe.read_buf + os - stripe.read_start, .len = ns-os };
bufs[nbufs++] = { .buf = stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
if (oe > ne)
bufs[nbufs++] = { .buf = stripe.read_buf + ne - stripe.read_start, .len = oe-ne };
// OLD
bufs[nbufs++] = { .buf = stripe.read_buf + os - stripe.read_start, .len = oe-os };
// XOR two logical byte sequences of length `len`, each given as a list of
// fragments (xor1[0..n1), xor2[0..n2)), into `dest`.
// The fragment lists are walked in parallel; each memxor() call covers the range
// up to the nearest fragment boundary of either list.
// NOTE(review): the i1/i2 increments and loop exits are not visible in this extract.
static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n2, void *dest, uint32_t len)
assert(n1 > 0 && n2 > 0);
int i1 = 0, i2 = 0;
// start*/end*: logical offsets of the current fragment of each list
uint32_t start1 = 0, start2 = 0, end1 = xor1[0].len, end2 = xor2[0].len;
uint32_t pos = 0;
while (pos < len)
// We know for sure that ranges overlap
uint32_t end = std::min(end1, end2);
memxor(xor1[i1].buf + pos-start1, xor2[i2].buf + pos-start2, dest+pos, end-pos);
pos = end;
// Move to the next fragment of whichever list was exhausted at `pos`
if (pos >= end1)
if (i1 >= n1)
assert(pos >= end2);
start1 = end1;
end1 += xor1[i1].len;
if (pos >= end2)
start2 = end2;
end2 += xor2[i2].len;
// Second stage of read-modify-write, run after the reads planned by calc_rmw()
// have completed: reconstruct missing stripes, calculate the new parity and
// prepare full-chunk writes for stripes that move to another OSD.
// Handles a single parity stripe only (pg_minsize is taken as pg_size-1).
// NOTE(review): braces and the copy calls (only their arguments remain) are
// missing from this extract.
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
int pg_minsize = pg_size-1;
for (int role = 0; role < pg_size; role++)
if (stripes[role].read_end != 0 && stripes[role].missing)
// Reconstruct missing stripe (EC k+1)
reconstruct_stripe(stripes, pg_size, role);
// Recompute the union [start, end) of the request ranges, as in calc_rmw()
uint32_t start = 0, end = 0;
if (!stripes[pg_minsize].missing || write_osd_set != read_osd_set)
for (int role = 0; role < pg_minsize; role++)
if (stripes[role].req_end != 0)
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
end = std::max(stripes[role].req_end, end);
if (write_osd_set != read_osd_set)
for (int role = 0; role < pg_minsize; role++)
// Data stripe moves to another OSD and is not fully overwritten by the request
if (write_osd_set[role] != read_osd_set[role] &&
(stripes[role].req_start != 0 || stripes[role].req_end != chunk_size))
// FIXME again, handle case when write_osd_set[role] is 0
// Copy modified chunk into the read buffer to write it back
stripes[role].read_buf + stripes[role].req_start,
stripes[role].req_end - stripes[role].req_start
// Write the whole chunk from the merged read buffer
stripes[role].write_buf = stripes[role].read_buf;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;
if (!stripes[pg_minsize].missing && end != 0)
// Calculate new parity (EC k+1)
// prev: -2 = no stripe seen, >= 0 = pending first stripe, -1 = parity buffer holds a partial result
int parity = pg_minsize, prev = -2;
for (int other = 0; other < pg_minsize; other++)
if (prev == -2)
prev = other;
int n1 = 0, n2 = 0;
buf_len_t xor1[3], xor2[3];
// First operand: the partial parity, or the pending first stripe
if (prev == -1)
xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
get_old_new_buffers(stripes[prev], start, end, xor1, n1);
prev = -1;
// Second operand: the current stripe
get_old_new_buffers(stripes[other], start, end, xor2, n2);
xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
if (write_osd_set != read_osd_set)
for (int role = pg_minsize; role < pg_size; role++)
// Parity stripe moves to another OSD and is only partially recalculated
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
// Copy new parity into the read buffer to write it back
stripes[role].read_buf + start,
end - start
stripes[role].write_buf = stripes[role].read_buf;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;

View File

@ -0,0 +1,37 @@
#pragma once
#include <stdint.h>
#include "object_id.h"
#include "osd_id.h"
#define MEM_ALIGNMENT 512
// A memory fragment: pointer plus length in bytes.
// Fragment lists of this type are built by get_old_new_buffers() and consumed by
// xor_multiple_buffers().
struct buf_len_t
void *buf;
uint64_t len;
// Per-stripe (per-role) state of a read or read-modify-write operation.
// All ranges are byte offsets inside the stripe's chunk; an end of 0 means "empty".
struct osd_rmw_stripe_t
// Where read data is placed / where data to be written is taken from
void *read_buf, *write_buf;
// Part of the stripe touched by the request itself (set by split_stripes())
uint32_t req_start, req_end;
// Part of the stripe that has to be read
uint32_t read_start, read_end;
// Part of the stripe that has to be written
uint32_t write_start, write_end;
// Set when no OSD holds this stripe, so its data must be reconstructed
bool missing;
// Split a request (offset `start`, length `len`) into per-stripe req_start/req_end ranges
void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes);
// Rebuild the read range of stripe `role` from the read buffers of the other stripes
void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role);
// Extend reads to recover missing data stripes; returns -1 if fewer than minsize stripes exist
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size);
// Allocate one aligned buffer for all stripe reads, preceded by add_size extra bytes
void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);
// Plan read/write ranges and buffers for a read-modify-write; returns the allocated RMW buffer
void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);
// After the reads complete: reconstruct missing stripes and calculate the new parity
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);

View File

@ -0,0 +1,360 @@
#include <string.h>
#include "osd_rmw.cpp"
#include "test_pattern.h"
// Forward declarations of the debugging helper and the test cases defined below
void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size);
void test1();
void test4();
void test5();
void test6();
void test7();
void test8();
void test9();
1. split(offset=128K-4K, len=8K)
= [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
= { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
= { read: [ 0, 128K-4K ] }
4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
= {
read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
+ check write2 buffer
5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
then, after calc_rmw_parity(): {
write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+ check write1 buffer
+ check write2 buffer
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
+ check write2 buffer
9. object recovery case:
calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
input buffer: NULL,
rmw buffer: [ read0, read1, read2 ],
then, after calc_rmw_parity(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+ check write0 buffer
// Test entry point: prints "all ok" and returns 0 once the end is reached.
// NOTE(review): the calls to the test functions are not visible in this extract;
// presumably each "Test N" comment was followed by a call to testN().
int main(int narg, char *args[])
// Test 1
// Test 4
// Test 5
// Test 6
// Test 7
// Test 8
// Test 9
// End
printf("all ok\n");
return 0;
// Debugging helper: print the request, read and write ranges of the first
// pg_size stripes, in kilobytes.
// NOTE(review): any label or newline output between the loops is not visible in this extract.
void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
// Request ranges
for (int i = 0; i < pg_size; i++)
printf(" {%uK-%uK}", stripes[i].req_start/1024, stripes[i].req_end/1024);
// Read ranges
for (int i = 0; i < pg_size; i++)
printf(" {%uK-%uK}", stripes[i].read_start/1024, stripes[i].read_end/1024);
// Write ranges
for (int i = 0; i < pg_size; i++)
printf(" {%uK-%uK}", stripes[i].write_start/1024, stripes[i].write_end/1024);
// Tests split_stripes(), extend_missing_stripes() and cover_read() on a 2+1 PG
// with 128K chunks where the OSD of role 1 is missing (osd_set = {1, 0, 3}).
void test1()
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 1.1
// An 8K request at offset 128K-4K covers the last 4K of stripe 0 and the first 4K of stripe 1
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
assert(stripes[2].req_end == 0);
// Test 1.2
// Plain read: the read ranges start out equal to the request ranges
for (int i = 0; i < 3; i++)
stripes[i].read_start = stripes[i].req_start;
stripes[i].read_end = stripes[i].req_end;
// Stripe 1 is missing, so its range [0, 4K) must also be read from stripes 0 and 2
assert(extend_missing_stripes(stripes, osd_set, 2, 3) == 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
// Test 1.3
// Covering [0, 128K) when the last 4K is written leaves [0, 128K-4K) to read
stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
cover_read(0, 128*1024, stripes[0]);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
// Tests calc_rmw() and calc_rmw_parity() for an 8K write crossing the boundary
// between stripes 0 and 1 of a 2+1 PG whose role 1 OSD is missing.
// NOTE(review): write_buf and rmw_buf are not freed in the visible lines.
void test4()
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 4.1
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
void* write_buf = malloc(8192);
void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
// Expected read ranges
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
// Expected write ranges: data stripes as requested, parity over the whole chunk
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
// RMW buffer layout: [ new parity (128K) | read0 (128K) | read1 (128K-4K) | read2 ]
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+128*1024*2);
assert(stripes[2].read_buf == rmw_buf+128*1024*3-4096);
// Data write buffers point into the request buffer, parity into the RMW buffer
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+4096);
assert(stripes[2].write_buf == rmw_buf);
// Test 4.2
set_pattern(write_buf, 8192, PATTERN0);
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
calc_rmw_parity(stripes, 3, osd_set, osd_set, 128*1024);
check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
// Tests split_stripes() and calc_rmw() for a 192K write at offset 0 (all of
// stripe 0, first half of stripe 1) on a 2+1 PG whose role 1 OSD is missing.
// NOTE(review): write_buf and rmw_buf are not freed in the visible lines.
void test5()
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 5.1
split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 64*1024);
assert(stripes[2].req_end == 0);
// Test 5.2
void *write_buf = malloc(64*1024*3);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
// Only the second half of every stripe has to be read
assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
// RMW buffer layout: [ new parity (128K) | read0 (64K) | read1 (64K) | read2 (64K) ]
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+64*3*1024);
assert(stripes[2].read_buf == rmw_buf+64*4*1024);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+128*1024);
assert(stripes[2].write_buf == rmw_buf);
// test6: same request as test5 (64*1024*3 bytes at offset 0), but with
// osd_set = { 1, 2, 3 } and 3 passed as the sixth argument of calc_rmw()
// (test5 passes 2 there). Only stripe 1 is expected to be read.
// NOTE(review): the meaning of the sixth calc_rmw() argument is not visible
// here - confirm against its definition.
void test6()
osd_num_t osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 6.1
split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
void *write_buf = malloc(64*1024*3);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
// Only stripe 1 has a read range (64 KiB..128 KiB)
assert(stripes[0].read_end == 0);
assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_end == 0);
// Expected write range of each stripe
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
// Stripes without a read range have no read buffer; the single read buffer
// follows stripes[2].write_buf inside rmw_buf
assert(stripes[0].read_buf == 0);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == 0);
// Written data is taken from write_buf; stripe 2 is written from rmw_buf
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+128*1024);
assert(stripes[2].write_buf == rmw_buf);
// test7: same request as test4 (8192 bytes at offset 128*1024-4096), but the
// two OSD sets passed to calc_rmw()/calc_rmw_parity() differ:
// osd_set = { 1, 0, 3 } and write_osd_set = { 1, 2, 3 }.
// After calc_rmw_parity() stripe 1 is expected to be written in full from its
// own read buffer.
// NOTE(review): argument meanings of split_stripes()/calc_rmw() are not visible
// in this file section - confirm against their definitions.
void test7()
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 7.1: ranges and buffer layout computed by calc_rmw()
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
void *write_buf = malloc(8192);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
// All three stripes are expected to be read in full
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
// Expected write range of each stripe right after calc_rmw()
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
// Expected layout of rmw_buf: stripes[2].write_buf at offset 0 (checked below),
// then three full-size read buffers
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+128*1024*2);
assert(stripes[2].read_buf == rmw_buf+128*1024*3);
// Written data is taken from write_buf; stripe 2 is written from rmw_buf
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+4096);
assert(stripes[2].write_buf == rmw_buf);
// Test 7.2: state after calc_rmw_parity()
set_pattern(write_buf, 8192, PATTERN0);
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
// The write range of stripe 1 is expected to grow to the whole stripe and
// its write buffer to be switched to its read buffer
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[1].write_buf == stripes[1].read_buf);
// Stripe 1: first 4096 bytes carry PATTERN0, the rest carries PATTERN1
check_pattern(stripes[1].write_buf, 4096, PATTERN0);
check_pattern(stripes[1].write_buf+4096, 128*1024-4096, PATTERN1);
// Stripe 2: first and last 4096 bytes equal PATTERN0^PATTERN1, the middle is 0
check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
// test8: exercises calc_rmw() and calc_rmw_parity() for a (128*1024+4096)-byte
// request at offset 0 with osd_set = { 0, 2, 3 } and write_osd_set = { 1, 2, 3 }.
// Only stripe 1 is expected to be read; the ranges and buffers set by
// calc_rmw() are expected to stay unchanged after calc_rmw_parity().
// NOTE(review): argument meanings of split_stripes()/calc_rmw() are not visible
// in this file section - confirm against their definitions.
void test8()
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 8.1: ranges and buffer layout computed by calc_rmw()
split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
void *write_buf = malloc(128*1024+4096);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
// Only stripe 1 has a read range (4096..128 KiB)
assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
// Expected write range of each stripe
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
// Stripes without a read range have no read buffer; the single read buffer
// follows stripes[2].write_buf inside rmw_buf
assert(stripes[0].read_buf == NULL);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == NULL);
// Written data is taken from write_buf; stripe 2 is written from rmw_buf
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+128*1024);
assert(stripes[2].write_buf == rmw_buf);
// Test 8.2: state after calc_rmw_parity()
set_pattern(write_buf, 128*1024+4096, PATTERN0);
set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096); // recheck again
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
assert(stripes[0].write_buf == write_buf); // recheck again
assert(stripes[1].write_buf == write_buf+128*1024); // recheck again
assert(stripes[2].write_buf == rmw_buf); // recheck again
// Stripe 2: first 4096 bytes are 0 (PATTERN0^PATTERN0), the rest equals
// PATTERN0^PATTERN1
check_pattern(stripes[2].write_buf, 4096, 0); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096, PATTERN0^PATTERN1); // new parity
// test9: zero-length request (length 0 at offset 64*1024) with
// osd_set = { 0, 2, 3 } and write_osd_set = { 1, 2, 3 }, and 3 passed as the
// sixth argument of calc_rmw(). No write buffer is supplied; after
// calc_rmw_parity() stripe 0 is expected to be written in full from rmw_buf.
// NOTE(review): argument meanings of split_stripes()/calc_rmw() are not visible
// in this file section - confirm against their definitions.
void test9()
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 9.0: a zero-length request produces no request range in any stripe
split_stripes(2, 128*1024, 64*1024, 0, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
// Test 9.1: ranges and buffer layout computed by calc_rmw()
void *write_buf = NULL;
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
// All three stripes are expected to have a full read range
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
// No write ranges are expected at this point
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
// The three read buffers are laid out back to back from the start of rmw_buf
assert(stripes[0].read_buf == rmw_buf);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == rmw_buf+128*1024*2);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == NULL);
// Test 9.2: state after calc_rmw_parity()
set_pattern(stripes[1].read_buf, 128*1024, 0);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
// Only stripe 0 is expected to get a write range, written from rmw_buf
// (which is also its read buffer, see above)
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
assert(stripes[0].write_buf == rmw_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == NULL);
// Stripe 0 must contain PATTERN1 (stripe 1 was filled with 0, stripe 2 with PATTERN1)
check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);

@ -1,38 +1,30 @@
// License: VNPL-1.1 (see for details)
// License: VNPL-1.1 (see for details)
#include "osd.h" #include "osd.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
void osd_t::secondary_op_callback(osd_op_t *op) void osd_t::secondary_op_callback(osd_op_t *op)
{ {
if (op->req.hdr.opcode == OSD_OP_SEC_READ || if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
op->req.hdr.opcode == OSD_OP_SEC_WRITE || op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{ {
op->reply.sec_rw.version = op->bs_op->version; op->reply.sec_rw.version = op->bs_op->version;
} }
else if (op->req.hdr.opcode == OSD_OP_SEC_DELETE) else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
{ {
op->reply.sec_del.version = op->bs_op->version; op->reply.sec_del.version = op->bs_op->version;
} }
if (op->req.hdr.opcode == OSD_OP_SEC_READ) if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->bs_op->retval > 0)
{ {
if (op->bs_op->retval >= 0) op->send_list.push_back(op->buf, op->bs_op->retval);
op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
op->reply.sec_rw.attr_len = 0;
if (op->bs_op->retval > 0)
op->iov.push_back(op->buf, op->bs_op->retval);
} }
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST) else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
{ {
// allocated by blockstore // allocated by blockstore
op->buf = op->bs_op->buf; op->buf = op->bs_op->buf;
if (op->bs_op->retval > 0) if (op->bs_op->retval > 0)
{ {
op->iov.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id)); op->send_list.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id));
} }
op->reply.sec_list.stable_count = op->bs_op->version; op->reply.sec_list.stable_count = op->bs_op->version;
} }
@ -46,40 +38,27 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
{ {
cur_op->bs_op = new blockstore_op_t(); cur_op->bs_op = new blockstore_op_t();
cur_op->bs_op->callback = [this, cur_op](blockstore_op_t* bs_op) { secondary_op_callback(cur_op); }; cur_op->bs_op->callback = [this, cur_op](blockstore_op_t* bs_op) { secondary_op_callback(cur_op); };
cur_op->bs_op->opcode = (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ? BS_OP_READ cur_op->bs_op->opcode = (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ? BS_OP_READ
: (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ? BS_OP_WRITE : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ? BS_OP_WRITE
: (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ? BS_OP_WRITE_STABLE : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_SYNC ? BS_OP_SYNC
: (cur_op->req.hdr.opcode == OSD_OP_SEC_SYNC ? BS_OP_SYNC : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ? BS_OP_STABLE
: (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ? BS_OP_STABLE : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK ? BS_OP_ROLLBACK
: (cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ? BS_OP_ROLLBACK : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE ? BS_OP_DELETE
: (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE ? BS_OP_DELETE : (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_LIST ? BS_OP_LIST
: (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ? BS_OP_LIST : -1)))))));
: -1)))))))); if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{ {
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
// Allocate memory for the read operation
if (clean_entry_bitmap_size > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
cur_op->bitmap = &cur_op->bmp_data;
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cur_op->bs_op->oid = cur_op->req.sec_rw.oid; cur_op->bs_op->oid = cur_op->req.sec_rw.oid;
cur_op->bs_op->version = cur_op->req.sec_rw.version; cur_op->bs_op->version = cur_op->req.sec_rw.version;
cur_op->bs_op->offset = cur_op->req.sec_rw.offset; cur_op->bs_op->offset = cur_op->req.sec_rw.offset;
cur_op->bs_op->len = cur_op->req.sec_rw.len; cur_op->bs_op->len = cur_op->req.sec_rw.len;
cur_op->bs_op->buf = cur_op->buf; cur_op->bs_op->buf = cur_op->buf;
cur_op->bs_op->bitmap = cur_op->bitmap;
#ifdef OSD_STUB #ifdef OSD_STUB
cur_op->bs_op->retval = cur_op->bs_op->len; cur_op->bs_op->retval = cur_op->bs_op->len;
#endif #endif
} }
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE) else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
{ {
cur_op->bs_op->oid = cur_op->req.sec_del.oid; cur_op->bs_op->oid = cur_op->req.sec_del.oid;
cur_op->bs_op->version = cur_op->req.sec_del.version; cur_op->bs_op->version = cur_op->req.sec_del.version;
@ -87,8 +66,8 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
cur_op->bs_op->retval = 0; cur_op->bs_op->retval = 0;
#endif #endif
} }
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK) cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
{ {
cur_op->bs_op->len = cur_op->req.sec_stab.len/sizeof(obj_ver_id); cur_op->bs_op->len = cur_op->req.sec_stab.len/sizeof(obj_ver_id);
cur_op->bs_op->buf = cur_op->buf; cur_op->bs_op->buf = cur_op->buf;
@ -96,12 +75,11 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
cur_op->bs_op->retval = 0; cur_op->bs_op->retval = 0;
#endif #endif
} }
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST) else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
{ {
if (cur_op->req.sec_list.pg_count < cur_op->req.sec_list.list_pg) if (cur_op->req.sec_list.pg_count < cur_op->req.sec_list.list_pg)
{ {
// requested pg number is greater than total pg count // requested pg number is greater than total pg count
printf("Invalid LIST request: pg count %u < pg number %u\n", cur_op->req.sec_list.pg_count, cur_op->req.sec_list.list_pg);
cur_op->bs_op->retval = -EINVAL; cur_op->bs_op->retval = -EINVAL;
secondary_op_callback(cur_op); secondary_op_callback(cur_op);
return; return;
@ -109,8 +87,6 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size; cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
cur_op->bs_op->len = cur_op->req.sec_list.pg_count; cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1; cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
#ifdef OSD_STUB #ifdef OSD_STUB
cur_op->bs_op->retval = 0; cur_op->bs_op->retval = 0;
cur_op->bs_op->buf = NULL; cur_op->bs_op->buf = NULL;
@ -127,9 +103,9 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
{ {
// FIXME: Send the real config, not its source // FIXME: Send the real config, not its source
std::string cfg_str = json11::Json(config).dump(); std::string cfg_str = json11::Json(config).dump();
cur_op->buf = malloc_or_die(cfg_str.size()+1); cur_op->buf = malloc(cfg_str.size()+1);
memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1); memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1); cur_op->send_list.push_back(cur_op->buf, cfg_str.size()+1);
finish_op(cur_op, cfg_str.size()+1); finish_op(cur_op, cfg_str.size()+1);
} }

osd_send.cpp Normal file
View File

@ -0,0 +1,145 @@
#include "cluster_client.h"
// Handles an operation being pushed to a client's outbox: records the start
// time of outgoing operations and tries to send immediately when possible.
// NOTE(review): block delimiters and possibly other lines are missing from this
// copy of the function, so the exact nesting of the conditions below cannot be
// confirmed from here.
void cluster_client_t::outbox_push(osd_op_t *cur_op)
// NOTE(review): the statement below is truncated in this copy; it presumably
// looks up the client by cur_op->peer_fd - confirm against the original file
auto & cl =>peer_fd);
// Outgoing operations get their start timestamp recorded in tv_begin
if (cur_op->op_type == OSD_OP_OUT)
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
// try_send() is only attempted when no write is in progress and the outbox
// holds at most one operation (short-circuit evaluation of ||)
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
if (cl.write_state == 0)
cl.write_state = CL_WRITE_READY;
// Prepares a sendmsg submission for the client's current (or next) outgoing
// operation. Returns false when no submission queue entry is available,
// true after the sendmsg request has been prepared.
// NOTE(review): block delimiters and possibly other lines are missing from this
// copy of the function, so the exact nesting of the statements below cannot be
// confirmed from here.
bool cluster_client_t::try_send(osd_client_t & cl)
int peer_fd = cl.peer_fd;
// Debug trace: prints file, line and the current time before requesting an SQE
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
return false;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.write_op)
// pick next command
cl.write_op = cl.outbox.front();
cl.write_state = CL_WRITE_REPLY;
if (cl.write_op->op_type == OSD_OP_IN)
// Measure execution latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
// Initialise the per-opcode accumulators while the count for this opcode is zero
if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
// Elapsed time since tv_begin, in microseconds
stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
// Byte counters are only updated for read/write opcodes
if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
// NOTE(review): the right-hand side of the statement below is truncated in this copy
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->;
else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
// Point the message at the operation's send list and submit it;
// handle_send() is invoked as the completion callback
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
return true;
// Calls try_send() for every client fd listed in write_ready_clients.
// NOTE(review): block delimiters and possibly other lines (for example an early
// return after the erase) are missing from this copy - confirm against the
// original file.
void cluster_client_t::send_replies()
for (int i = 0; i < write_ready_clients.size(); i++)
int peer_fd = write_ready_clients[i];
if (!try_send(clients[peer_fd]))
// try_send() failed: drop the first i entries from the list
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
// Completion callback for the sendmsg request prepared in try_send():
// data->res holds the result of the send for the client identified by peer_fd.
// NOTE(review): block delimiters and several statements appear to be missing
// from this copy (the text ends on a dangling `if`), so the comments below
// describe only what is visible.
void cluster_client_t::handle_send(ring_data_t *data, int peer_fd)
// Nothing is done for a client that is no longer in the clients map
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
auto & cl = cl_it->second;
// Any negative result other than -EAGAIN is reported as a write error
if (data->res < 0 && data->res != -EAGAIN)
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
if (data->res >= 0)
osd_op_t *cur_op = cl.write_op;
// Account the sent byte count against the entries of the send list
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
// Entry fits into the remaining byte count: subtract its length
if (iov.iov_len <= data->res)
data->res -= iov.iov_len;
// Entry only partially sent: shrink it and advance its base pointer
// NOTE(review): presumably the else branch of the check above - confirm
iov.iov_len -= data->res;
iov.iov_base += data->res;
if (cur_op->send_list.sent >= cur_op->send_list.count)
// Done
if (cur_op->op_type == OSD_OP_IN)
delete cur_op;
// NOTE(review): the map key in the statement below is truncated in this copy
cl.sent_ops[cl.write_op->] = cl.write_op;
cl.write_op = NULL;
// More queued operations keep the client in the write-ready state
cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
if (cl.write_state != 0)

@ -1,6 +1,3 @@
// License: VNPL-1.1 (see for details)
// License: VNPL-1.1 (see for details)
#include <sys/types.h> #include <sys/types.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <netinet/in.h> #include <netinet/in.h>
@ -22,8 +19,6 @@
int connect_osd(const char *osd_address, int osd_port); int connect_osd(const char *osd_address, int osd_port);
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern); uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len); void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@ -110,7 +105,7 @@ int main3(int narg, char *args[])
return 0; return 0;
} }
int main4(int narg, char *args[]) int main(int narg, char *args[])
{ {
int connect_fd; int connect_fd;
// Cluster write (sync not implemented yet) // Cluster write (sync not implemented yet)
@ -122,15 +117,6 @@ int main4(int narg, char *args[])
return 0; return 0;
} }
// Entry point: connects to an OSD on port 43051 and runs test_read() for
// inode 1, stripe 1039663104, version UINT64_MAX, offset 0, length 128 KiB.
// NOTE(review): the address string passed to connect_osd() is empty in this
// copy - confirm against the original file.
int main(int narg, char *args[])
int connect_fd;
connect_fd = connect_osd("", 43051);
test_read(connect_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
return 0;
int connect_osd(const char *osd_address, int osd_port) int connect_osd(const char *osd_address, int osd_port)
{ {
struct sockaddr_in addr; struct sockaddr_in addr;
@ -181,73 +167,13 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
return true; return true;
} }
// Sends an OSD_OP_SEC_READ request for the given object over connect_fd, reads
// the reply and the data, then sends an OSD_OP_SEC_LIST request and prints the
// listed entries that match the given inode and stripe.
// Every return statement visible here returns 0.
// NOTE(review): block delimiters and possibly other lines are missing from this
// copy; no free() of either memalign() buffer is visible.
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
osd_any_op_t op;
osd_any_reply_t reply;
// NOTE(review): the second assignment on the line below is truncated in this copy
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; = 1;
op.hdr.opcode = OSD_OP_SEC_READ;
op.sec_rw.oid = {
.inode = inode,
.stripe = stripe,
op.sec_rw.version = version;
op.sec_rw.offset = offset;
op.sec_rw.len = len;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
// Send the request packet, then read the reply packet
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, op.sec_rw.len))
return 0;
// Read len bytes of object data following the reply packet
r = read_blocking(connect_fd, data, len);
if (r != len)
perror("read data");
return 0;
printf("Read %lx:%lx v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
// Second request: list with list_pg = 1, pg_count = 1 and a 4 MiB pg_stripe_size
op.hdr.opcode = OSD_OP_SEC_LIST;
op.sec_list.list_pg = 1;
op.sec_list.pg_count = 1;
op.sec_list.pg_stripe_size = 4*1024*1024;
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (reply.hdr.retval < 0 || !check_reply(r, op, reply, reply.hdr.retval))
return 0;
// reply.hdr.retval is used as the number of obj_ver_id entries that follow
data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
if (r != sizeof(obj_ver_id)*reply.hdr.retval)
perror("read data");
return 0;
obj_ver_id *ov = (obj_ver_id*)data;
// Print entries with the same inode and the same stripe, ignoring the low 12
// bits of the stripe; entries with index below stable_count print stable=1
for (int i = 0; i < reply.hdr.retval; i++)
if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
printf("list: %lx:%lx v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
return 0;
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern) uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
{ {
osd_any_op_t op; osd_any_op_t op;
osd_any_reply_t reply; osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; op.hdr.magic = SECONDARY_OSD_OP_MAGIC; = 1; = 1;
op.hdr.opcode = OSD_OP_SEC_WRITE; op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
op.sec_rw.oid = { op.sec_rw.oid = {
.inode = inode, .inode = inode,
.stripe = stripe, .stripe = stripe,
@ -357,7 +283,7 @@ void test_list_stab(int connect_fd)
osd_any_reply_t reply; osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; op.hdr.magic = SECONDARY_OSD_OP_MAGIC; = 1; = 1;
op.hdr.opcode = OSD_OP_SEC_LIST; op.hdr.opcode = OSD_OP_SECONDARY_LIST;
op.sec_list.pg_count = 0; op.sec_list.pg_count = 0;
assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE); assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE); int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
@ -373,7 +299,7 @@ void test_list_stab(int connect_fd)
// Stabilize in portions of 32 entries // Stabilize in portions of 32 entries
if (i - last_start >= 32 || i == total_count) if (i - last_start >= 32 || i == total_count)
{ {
op.hdr.opcode = OSD_OP_SEC_STABILIZE; op.hdr.opcode = OSD_OP_SECONDARY_STABILIZE;
op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start); op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start);
assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE); assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
assert(write_blocking(connect_fd, data + last_start, op.sec_stab.len) == op.sec_stab.len); assert(write_blocking(connect_fd, data + last_start, op.sec_stab.len) == op.sec_stab.len);

@ -1,11 +1,8 @@
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#include "pg_states.h" #include "pg_states.h"
const int pg_state_bit_count = 14; const int pg_state_bit_count = 13;
const int pg_state_bits[14] = { const int pg_state_bits[13] = {
@ -17,11 +14,10 @@ const int pg_state_bits[14] = {
}; };
const char *pg_state_names[14] = { const char *pg_state_names[13] = {
"starting", "starting",
"peering", "peering",
"incomplete", "incomplete",
@ -33,6 +29,5 @@ const char *pg_state_names[14] = {
"has_degraded", "has_degraded",
"has_misplaced", "has_misplaced",
"has_unclean", "has_unclean",
"left_on_dead", "left_on_dead",
}; };

@ -1,6 +1,3 @@
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
// License: VNPL-1.1 or GNU GPL-2.0+ (see for details)
#pragma once #pragma once
// Placement group states // Placement group states
@ -18,11 +15,9 @@
#define PG_HAS_DEGRADED (1<<8) #define PG_HAS_DEGRADED (1<<8)
#define PG_HAS_MISPLACED (1<<9) #define PG_HAS_MISPLACED (1<<9)
#define PG_HAS_UNCLEAN (1<<10) #define PG_HAS_UNCLEAN (1<<10)
#define PG_HAS_INVALID (1<<11) #define PG_LEFT_ON_DEAD (1<<11)
#define PG_LEFT_ON_DEAD (1<<12)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
#define STRIPE_MASK ((uint64_t)4096 - 1) #define STRIPE_MASK ((uint64_t)4096 - 1)
// OSD object states // OSD object states
@ -31,6 +26,7 @@
#define OBJ_MISPLACED 0x08 #define OBJ_MISPLACED 0x08
#define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_NEEDS_ROLLBACK 0x20000
#define OBJ_BUGGY 0x80000
extern const int pg_state_bits[]; extern const int pg_state_bits[];
extern const char *pg_state_names[]; extern const char *pg_state_names[];

Some files were not shown because too many files have changed in this diff Show More