2020-09-17 23:02:40 +03:00
|
|
|
// Copyright (c) Vitaliy Filippov, 2019+
|
2021-02-06 01:26:07 +03:00
|
|
|
// License: VNPL-1.1 (see README.md for details)
|
2020-09-17 23:02:40 +03:00
|
|
|
|
2019-11-03 01:34:29 +03:00
|
|
|
#pragma once
|
|
|
|
|
2019-11-04 15:46:33 +03:00
|
|
|
#include "crc32c.h"

#include <functional>
#include <set>
|
2019-11-04 15:46:33 +03:00
|
|
|
|
2019-11-03 01:34:29 +03:00
|
|
|
// Journal geometry and on-disk format constants.
// Expression-valued macros are parenthesized so that they expand safely inside
// larger expressions such as `x / MIN_JOURNAL_SIZE` or `x % JOURNAL_BUFFER_SIZE`.
#define MIN_JOURNAL_SIZE (4*1024*1024)
#define JOURNAL_MAGIC 0x4A33
#define JOURNAL_VERSION 1
#define JOURNAL_BUFFER_SIZE (4*1024*1024)

// We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536

// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase
//
// Entry type codes; JE_MIN and JE_MAX bound the range of defined codes.
#define JE_MIN 0x01
#define JE_START 0x01
#define JE_SMALL_WRITE 0x02
#define JE_BIG_WRITE 0x03
#define JE_STABLE 0x04
#define JE_DELETE 0x05
#define JE_ROLLBACK 0x06
#define JE_SMALL_WRITE_INSTANT 0x07
#define JE_BIG_WRITE_INSTANT 0x08
#define JE_MAX 0x08
|
2019-11-03 01:34:29 +03:00
|
|
|
|
2019-11-04 01:42:40 +03:00
|
|
|
// crc32c comes first to ease calculation and is equal to crc32()
|
2019-11-03 01:34:29 +03:00
|
|
|
// Journal entry of type JE_START.
// The layout is packed and fixed: the first four fields (crc32, magic, type, size)
// match the common header used by every other entry type in this file.
struct __attribute__((__packed__)) journal_entry_start
{
    // Checksum of the entry; kept first so it is easy to skip when hashing
    uint32_t crc32;
    uint16_t magic;
    uint16_t type;
    // Entry size in bytes
    uint32_t size;
    // Occupies the slot where other entry types store crc32_prev
    uint32_t reserved;
    // NOTE(review): presumably the offset of the first used journal block - confirm
    uint64_t journal_start;
    // NOTE(review): presumably compared against JOURNAL_VERSION by the reader - confirm
    uint64_t version;
};

// Size of the fields preceding <version>, i.e. the entry size without that field
// (presumably the format written before <version> was introduced)
#define JE_START_LEGACY_SIZE 24

// Lock the packed layout to the legacy size so that an accidental field change
// is caught at compile time instead of silently changing the layout
static_assert(
    sizeof(journal_entry_start) == JE_START_LEGACY_SIZE + sizeof(uint64_t),
    "journal_entry_start must be the legacy start entry plus the 64-bit version field"
);
|
2019-11-03 01:34:29 +03:00
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_small_write
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32;
|
2019-11-03 01:34:29 +03:00
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t crc32_prev;
|
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
|
|
|
uint32_t offset;
|
|
|
|
uint32_t len;
|
2019-11-20 00:46:44 +03:00
|
|
|
// small_write entries contain <len> bytes of data which is stored in next sectors
|
|
|
|
// data_offset is its offset within journal
|
|
|
|
uint64_t data_offset;
|
2019-11-27 02:20:38 +03:00
|
|
|
uint32_t crc32_data;
|
2021-01-10 01:54:58 +03:00
|
|
|
// small_write and big_write entries are followed by the "external" bitmap
|
|
|
|
// its size is dynamic and included in journal entry's <size> field
|
|
|
|
uint8_t bitmap[];
|
2019-11-03 01:34:29 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_big_write
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32;
|
2019-11-03 01:34:29 +03:00
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t crc32_prev;
|
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
2020-01-12 19:48:03 +03:00
|
|
|
uint32_t offset;
|
|
|
|
uint32_t len;
|
2019-11-12 20:55:17 +03:00
|
|
|
uint64_t location;
|
2021-01-10 01:54:58 +03:00
|
|
|
// small_write and big_write entries are followed by the "external" bitmap
|
|
|
|
// its size is dynamic and included in journal entry's <size> field
|
|
|
|
uint8_t bitmap[];
|
2019-11-03 01:34:29 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry_stable
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32;
|
2019-11-03 01:34:29 +03:00
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t crc32_prev;
|
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
|
|
|
};
|
|
|
|
|
2020-01-24 20:10:18 +03:00
|
|
|
struct __attribute__((__packed__)) journal_entry_rollback
|
|
|
|
{
|
|
|
|
uint32_t crc32;
|
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t crc32_prev;
|
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
|
|
|
};
|
|
|
|
|
2019-11-03 01:34:29 +03:00
|
|
|
struct __attribute__((__packed__)) journal_entry_del
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32;
|
2019-11-03 01:34:29 +03:00
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t crc32_prev;
|
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct __attribute__((__packed__)) journal_entry
|
|
|
|
{
|
|
|
|
union
|
|
|
|
{
|
|
|
|
struct __attribute__((__packed__))
|
|
|
|
{
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32;
|
2019-11-03 01:34:29 +03:00
|
|
|
uint16_t magic;
|
|
|
|
uint16_t type;
|
|
|
|
uint32_t size;
|
2019-11-04 01:42:40 +03:00
|
|
|
uint32_t crc32_prev;
|
2019-11-03 01:34:29 +03:00
|
|
|
};
|
|
|
|
journal_entry_start start;
|
|
|
|
journal_entry_small_write small_write;
|
|
|
|
journal_entry_big_write big_write;
|
|
|
|
journal_entry_stable stable;
|
2020-01-24 20:10:18 +03:00
|
|
|
journal_entry_rollback rollback;
|
2019-11-03 01:34:29 +03:00
|
|
|
journal_entry_del del;
|
|
|
|
};
|
|
|
|
};
|
2019-11-04 15:46:33 +03:00
|
|
|
|
|
|
|
// Computes the checksum of a journal entry.
// The leading 4-byte crc32 field itself is skipped: hashing starts right after
// it and covers the remaining (je->size - 4) bytes of the entry.
inline uint32_t je_crc32(journal_entry *je)
{
    // 0x48674bc7 = crc32(4 zero bytes), used as the initial value
    const uint32_t initial_crc = 0x48674bc7;
    uint8_t *after_crc_field = ((uint8_t*)je) + 4;
    return crc32c(initial_crc, after_crc_field, je->size - 4);
}
|
2019-11-07 16:58:30 +03:00
|
|
|
|
|
|
|
// Bookkeeping for one journal sector buffer (see journal_t::sector_info).
struct journal_sector_info_t
{
    // NOTE(review): presumably the sector's offset within the journal - confirm
    uint64_t offset;
    uint64_t flush_count;
    // Checked by journal_t::entry_fits() when same-sector overwrites are disabled
    bool written;
    bool dirty;
    // NOTE(review): presumably a value taken from journal_t::submit_id - confirm
    uint64_t submit_id;
};
|
|
|
|
|
2021-12-15 02:43:12 +03:00
|
|
|
struct pending_journaling_t
|
|
|
|
{
|
|
|
|
uint64_t flush_id;
|
|
|
|
int sector;
|
|
|
|
blockstore_op_t *op;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline bool operator < (const pending_journaling_t & a, const pending_journaling_t & b)
|
|
|
|
{
|
|
|
|
return a.flush_id < b.flush_id || a.flush_id == b.flush_id && a.op < b.op;
|
|
|
|
}
|
|
|
|
|
2019-11-07 16:58:30 +03:00
|
|
|
struct journal_t
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
uint64_t device_size;
|
2019-11-28 14:41:03 +03:00
|
|
|
bool inmemory = false;
|
2021-04-10 02:23:55 +03:00
|
|
|
bool flush_journal = false;
|
2019-11-28 14:41:03 +03:00
|
|
|
void *buffer = NULL;
|
2019-11-07 16:58:30 +03:00
|
|
|
|
2020-04-14 19:19:56 +03:00
|
|
|
uint64_t block_size;
|
2019-11-07 16:58:30 +03:00
|
|
|
uint64_t offset, len;
|
2020-03-07 17:36:58 +03:00
|
|
|
// Next free block offset
|
2020-01-16 00:35:35 +03:00
|
|
|
uint64_t next_free = 0;
|
2020-03-07 17:36:58 +03:00
|
|
|
// First occupied block offset
|
2020-01-16 00:35:35 +03:00
|
|
|
uint64_t used_start = 0;
|
2020-03-07 17:36:58 +03:00
|
|
|
// End of the last block not used for writing anymore
|
|
|
|
uint64_t dirty_start = 0;
|
2019-11-07 16:58:30 +03:00
|
|
|
uint32_t crc32_last = 0;
|
|
|
|
|
|
|
|
// Current sector(s) used for writing
|
2019-11-28 14:41:03 +03:00
|
|
|
void *sector_buf = NULL;
|
|
|
|
journal_sector_info_t *sector_info = NULL;
|
2019-11-07 16:58:30 +03:00
|
|
|
uint64_t sector_count;
|
2020-09-12 19:14:51 +03:00
|
|
|
bool no_same_sector_overwrites = false;
|
2019-11-11 00:28:14 +03:00
|
|
|
int cur_sector = 0;
|
2020-01-16 00:35:35 +03:00
|
|
|
int in_sector_pos = 0;
|
2021-12-15 02:43:12 +03:00
|
|
|
std::vector<int> submitting_sectors;
|
|
|
|
std::set<pending_journaling_t> flushing_ops;
|
|
|
|
uint64_t submit_id = 0;
|
2019-11-14 21:15:59 +03:00
|
|
|
|
|
|
|
// Used sector map
|
|
|
|
// May use ~ 80 MB per 1 GB of used journal space in the worst case
|
|
|
|
std::map<uint64_t, uint64_t> used_sectors;
|
2019-11-28 14:41:03 +03:00
|
|
|
|
|
|
|
~journal_t();
|
2019-11-29 02:13:30 +03:00
|
|
|
bool trim();
|
2020-10-24 00:27:03 +03:00
|
|
|
uint64_t get_trim_pos();
|
2021-07-17 16:13:41 +03:00
|
|
|
void dump_diagnostics();
|
2021-02-02 01:29:11 +03:00
|
|
|
inline bool entry_fits(int size)
|
|
|
|
{
|
|
|
|
return !(block_size - in_sector_pos < size ||
|
|
|
|
no_same_sector_overwrites && sector_info[cur_sector].written);
|
|
|
|
}
|
2019-11-07 16:58:30 +03:00
|
|
|
};
|
2019-11-11 00:28:14 +03:00
|
|
|
|
|
|
|
struct blockstore_journal_check_t
|
|
|
|
{
|
2019-12-15 14:49:10 +03:00
|
|
|
blockstore_impl_t *bs;
|
2019-11-11 00:28:14 +03:00
|
|
|
uint64_t next_pos, next_sector, next_in_pos;
|
2021-02-02 01:30:23 +03:00
|
|
|
int sectors_to_write, first_sector;
|
2019-11-27 11:35:11 +03:00
|
|
|
bool right_dir; // writing to the end or the beginning of the ring buffer
|
2019-11-11 00:28:14 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
blockstore_journal_check_t(blockstore_impl_t *bs);
|
2019-12-15 14:11:03 +03:00
|
|
|
int check_available(blockstore_op_t *op, int required, int size, int data_after);
|
2019-11-11 00:28:14 +03:00
|
|
|
};
|
|
|
|
|
2019-11-19 18:07:40 +03:00
|
|
|
// Declaration only; the definition lives in another translation unit.
// Takes the journal state by reference plus an entry type code (one of JE_*)
// and an entry size, and returns a pointer to a journal_entry.
// NOTE(review): presumably reserves the entry in the current journal sector and
// fills its common header fields - confirm against the definition.
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
|