forked from vitalif/vitastor
Compare commits
9 Commits
Author | SHA1 | Date | |
---|---|---|---|
28be049909 | |||
78fbaacf1f | |||
1526c5a213 | |||
c7cc414c90 | |||
f4ea313707 | |||
b88b76f316 | |||
4a17a61d1f | |||
ccabbbfbcb | |||
26dac57083 |
12
README.md
12
README.md
@@ -16,7 +16,8 @@ breaking changes in the future. However, the following is implemented:
|
||||
|
||||
- Basic part: highly-available block storage with symmetric clustering and no SPOF
|
||||
- Performance ;-D
|
||||
- Two redundancy schemes: Replication and XOR n+1 (simplest case of EC)
|
||||
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
|
||||
based on jerasure library with any number of data and parity drives in a group
|
||||
- Configuration via simple JSON data structures in etcd
|
||||
- Automatic data distribution over OSDs, with support for:
|
||||
- Mathematical optimization for better uniformity and less data movement
|
||||
@@ -39,8 +40,6 @@ breaking changes in the future. However, the following is implemented:
|
||||
- OSD creation tool (OSDs currently have to be created by hand)
|
||||
- Other administrative tools
|
||||
- Per-inode I/O and space usage statistics
|
||||
- jerasure EC support with any number of data and parity drives in a group
|
||||
- Parallel usage of multiple network interfaces
|
||||
- Proxmox and OpenNebula plugins
|
||||
- iSCSI proxy
|
||||
- Inode metadata storage in etcd
|
||||
@@ -50,6 +49,7 @@ breaking changes in the future. However, the following is implemented:
|
||||
- Checksums
|
||||
- SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
|
||||
- RDMA and NVDIMM support
|
||||
- Web GUI
|
||||
- Compression (possibly)
|
||||
- Read caching using system page cache (possibly)
|
||||
|
||||
@@ -353,6 +353,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||
- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
||||
(if all your drives have capacitors).
|
||||
- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
|
||||
For jerasure pools the configuration should look like the following: `"2":{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
|
||||
- Calculate offsets for your drives with `node /usr/lib/vitastor/mon/simple-offsets.js --device /dev/sdX`.
|
||||
- Make systemd units for your OSDs. Look at `/usr/lib/vitastor/mon/make-units.sh` for example.
|
||||
Notable configuration variables from the example:
|
||||
@@ -398,10 +399,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||
|
||||
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
|
||||
deletion because proper handling of object cleanup in a cluster should be "three-phase"
|
||||
and it's currently not implemented. Inode removal tool currently can't handle unclean
|
||||
objects, so incomplete objects become undeletable. This will be fixed in near future
|
||||
by allowing the inode removal tool to delete unclean objects. With this problem fixed
|
||||
you'll be able just to repeat the removal again.
|
||||
and it's currently not implemented. Just repeat the removal again in this case.
|
||||
|
||||
## Implementation Principles
|
||||
|
||||
|
2
debian/control
vendored
2
debian/control
vendored
@@ -9,7 +9,7 @@ Rules-Requires-Root: no
|
||||
|
||||
Package: vitastor
|
||||
Architecture: amd64
|
||||
Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2
|
||||
Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2, lp-solve
|
||||
Description: Vitastor, a fast software-defined clustered block storage
|
||||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
|
||||
architecturally similar to Ceph which means strong consistency, primary-replication,
|
||||
|
109
dump_journal.cpp
109
dump_journal.cpp
@@ -26,23 +26,32 @@ struct journal_dump_t
|
||||
uint64_t journal_offset;
|
||||
uint64_t journal_len;
|
||||
uint64_t journal_pos;
|
||||
bool all;
|
||||
bool started;
|
||||
int fd;
|
||||
uint32_t crc32_last;
|
||||
|
||||
void dump_block(void *buf);
|
||||
int dump_block(void *buf);
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
if (argc < 5)
|
||||
journal_dump_t self = { 0 };
|
||||
int b = 1;
|
||||
if (argc >= 2 && !strcmp(argv[1], "--all"))
|
||||
{
|
||||
printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
|
||||
self.all = true;
|
||||
b = 2;
|
||||
}
|
||||
if (argc < b+4)
|
||||
{
|
||||
printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
journal_dump_t self;
|
||||
self.journal_device = argv[1];
|
||||
self.journal_block = strtoul(argv[2], NULL, 10);
|
||||
self.journal_offset = strtoull(argv[3], NULL, 10);
|
||||
self.journal_len = strtoull(argv[4], NULL, 10);
|
||||
self.journal_device = argv[b];
|
||||
self.journal_block = strtoul(argv[b+1], NULL, 10);
|
||||
self.journal_offset = strtoull(argv[b+2], NULL, 10);
|
||||
self.journal_len = strtoull(argv[b+3], NULL, 10);
|
||||
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
|
||||
self.journal_block > 128*1024)
|
||||
{
|
||||
@@ -57,30 +66,64 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
void *data = memalign(MEM_ALIGNMENT, self.journal_block);
|
||||
self.journal_pos = 0;
|
||||
while (self.journal_pos < self.journal_len)
|
||||
if (self.all)
|
||||
{
|
||||
while (self.journal_pos < self.journal_len)
|
||||
{
|
||||
int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
|
||||
assert(r == self.journal_block);
|
||||
uint64_t s;
|
||||
for (s = 0; s < self.journal_block; s += 8)
|
||||
{
|
||||
if (*((uint64_t*)(data+s)) != 0)
|
||||
break;
|
||||
}
|
||||
if (s == self.journal_block)
|
||||
{
|
||||
printf("offset %08lx: zeroes\n", self.journal_pos);
|
||||
self.journal_pos += self.journal_block;
|
||||
}
|
||||
else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
|
||||
{
|
||||
printf("offset %08lx:\n", self.journal_pos);
|
||||
self.dump_block(data);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
|
||||
self.journal_pos += self.journal_block;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
|
||||
assert(r == self.journal_block);
|
||||
uint64_t s;
|
||||
for (s = 0; s < self.journal_block; s += 8)
|
||||
journal_entry *je = (journal_entry*)(data);
|
||||
if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
|
||||
{
|
||||
if (*((uint64_t*)(data+s)) != 0)
|
||||
break;
|
||||
}
|
||||
if (s == self.journal_block)
|
||||
{
|
||||
printf("offset %08lx: zeroes\n", self.journal_pos);
|
||||
self.journal_pos += self.journal_block;
|
||||
}
|
||||
else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
|
||||
{
|
||||
printf("offset %08lx:\n", self.journal_pos);
|
||||
self.dump_block(data);
|
||||
printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
|
||||
self.journal_pos += self.journal_block;
|
||||
printf("offset %08lx:\n", self.journal_pos);
|
||||
self.dump_block(data);
|
||||
self.started = false;
|
||||
self.journal_pos = je->start.journal_start;
|
||||
while (1)
|
||||
{
|
||||
if (self.journal_pos >= self.journal_len)
|
||||
self.journal_pos = self.journal_block;
|
||||
r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
|
||||
assert(r == self.journal_block);
|
||||
printf("offset %08lx:\n", self.journal_pos);
|
||||
r = self.dump_block(data);
|
||||
if (r <= 0)
|
||||
{
|
||||
printf("end of the journal\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free(data);
|
||||
@@ -88,7 +131,7 @@ int main(int argc, char *argv[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
void journal_dump_t::dump_block(void *buf)
|
||||
int journal_dump_t::dump_block(void *buf)
|
||||
{
|
||||
uint32_t pos = 0;
|
||||
journal_pos += journal_block;
|
||||
@@ -97,12 +140,19 @@ void journal_dump_t::dump_block(void *buf)
|
||||
while (pos < journal_block)
|
||||
{
|
||||
journal_entry *je = (journal_entry*)(buf + pos);
|
||||
if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX)
|
||||
if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
|
||||
!all && started && je->crc32_prev != crc32_last)
|
||||
{
|
||||
break;
|
||||
}
|
||||
const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
|
||||
printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
|
||||
bool crc32_valid = je_crc32(je) == je->crc32;
|
||||
if (!all && !crc32_valid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
started = true;
|
||||
crc32_last = je->crc32;
|
||||
printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
|
||||
if (je->type == JE_START)
|
||||
{
|
||||
printf("je_start start=%08lx\n", je->start.journal_start);
|
||||
@@ -170,4 +220,5 @@ void journal_dump_t::dump_block(void *buf)
|
||||
{
|
||||
journal_pos = journal_len;
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
@@ -84,8 +84,12 @@ void epoll_manager_t::handle_epoll_events()
|
||||
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
|
||||
for (int i = 0; i < nfds; i++)
|
||||
{
|
||||
auto & cb = epoll_handlers[events[i].data.fd];
|
||||
cb(events[i].data.fd, events[i].events);
|
||||
auto cb_it = epoll_handlers.find(events[i].data.fd);
|
||||
if (cb_it != epoll_handlers.end())
|
||||
{
|
||||
auto & cb = cb_it->second;
|
||||
cb(events[i].data.fd, events[i].events);
|
||||
}
|
||||
}
|
||||
} while (nfds == MAX_EPOLL_EVENTS);
|
||||
}
|
||||
|
@@ -348,7 +348,14 @@ void osd_messenger_t::stop_client(int peer_fd)
|
||||
}
|
||||
if (cl->read_op)
|
||||
{
|
||||
delete cl->read_op;
|
||||
if (cl->read_op->callback)
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
||||
|
@@ -94,7 +94,7 @@ struct pg_t
|
||||
std::vector<osd_num_t> cur_set;
|
||||
// same thing in state_dict-like format
|
||||
pg_osd_set_t cur_loc_set;
|
||||
// moved object map. by default, each object is considered to reside on the cur_set.
|
||||
// moved object map. by default, each object is considered to reside on cur_set.
|
||||
// this map stores all objects that differ.
|
||||
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||
|
12
osd_rmw.cpp
12
osd_rmw.cpp
@@ -11,6 +11,8 @@
|
||||
#include "osd_rmw.h"
|
||||
#include "malloc_or_die.h"
|
||||
|
||||
#define OSD_JERASURE_W 32
|
||||
|
||||
static inline void extend_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & stripe)
|
||||
{
|
||||
if (stripe.read_end == 0)
|
||||
@@ -158,7 +160,7 @@ void use_jerasure(int pg_size, int pg_minsize, bool use)
|
||||
{
|
||||
return;
|
||||
}
|
||||
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, 32);
|
||||
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
|
||||
matrices[key] = (reed_sol_matrix_t){
|
||||
.refs = 0,
|
||||
.data = matrix,
|
||||
@@ -214,8 +216,8 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
|
||||
int *decoding_matrix = dm_ids + pg_minsize;
|
||||
if (!dm_ids)
|
||||
throw std::bad_alloc();
|
||||
// we always use row_k_ones=1 and w=32
|
||||
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, 32, matrix->data, erased, decoding_matrix, dm_ids) < 0)
|
||||
// we always use row_k_ones=1 and w=OSD_JERASURE_W (32)
|
||||
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, erased, decoding_matrix, dm_ids) < 0)
|
||||
{
|
||||
free(dm_ids);
|
||||
throw std::runtime_error("jerasure_make_decoding_matrix() failed");
|
||||
@@ -252,7 +254,7 @@ void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg
|
||||
}
|
||||
data_ptrs[role] = (char*)stripes[role].read_buf;
|
||||
jerasure_matrix_dotprod(
|
||||
pg_minsize, 32, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||
data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
|
||||
);
|
||||
}
|
||||
@@ -694,7 +696,7 @@ void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
|
||||
}
|
||||
}
|
||||
jerasure_matrix_encode(
|
||||
pg_minsize, pg_size-pg_minsize, 32, matrix->data,
|
||||
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
|
||||
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
|
||||
);
|
||||
pos = next_end;
|
||||
|
203
rm_inode.cpp
203
rm_inode.cpp
@@ -6,26 +6,38 @@
|
||||
* May be included into a bigger "command-line management interface" in the future
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "cluster_client.h"
|
||||
#include "pg_states.h"
|
||||
|
||||
#define RM_NO_LIST 1
|
||||
#define RM_LIST_SENT 2
|
||||
#define RM_REMOVING 3
|
||||
#define RM_END 4
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
struct rm_pg_t;
|
||||
|
||||
struct rm_pg_osd_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
rm_pg_t *pg = NULL;
|
||||
osd_num_t osd_num;
|
||||
bool sent = false;
|
||||
};
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::vector<rm_pg_osd_t> list_osds;
|
||||
int state = 0;
|
||||
obj_ver_id *obj_list = NULL;
|
||||
uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0;
|
||||
int to_list;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
@@ -41,11 +53,12 @@ protected:
|
||||
cluster_client_t *cli = NULL;
|
||||
ring_consumer_t consumer;
|
||||
|
||||
std::vector<rm_pg_osd_t*> lists;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool started = false;
|
||||
bool progress = true;
|
||||
bool list_first = false;
|
||||
int log_level = 0;
|
||||
|
||||
public:
|
||||
@@ -62,7 +75,7 @@ public:
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
||||
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
|
||||
}
|
||||
}
|
||||
return cfg;
|
||||
@@ -74,7 +87,7 @@ public:
|
||||
"Vitastor inode removal tool\n"
|
||||
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
|
||||
"USAGE:\n"
|
||||
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n",
|
||||
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode> [--wait-list]\n",
|
||||
exe_name
|
||||
);
|
||||
exit(0);
|
||||
@@ -105,6 +118,7 @@ public:
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
@@ -137,21 +151,57 @@ public:
|
||||
for (auto & pg_item: pool_cfg.pg_config)
|
||||
{
|
||||
auto & pg = pg_item.second;
|
||||
if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE)
|
||||
if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
|
||||
{
|
||||
// FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command
|
||||
fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
|
||||
fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
|
||||
continue;
|
||||
}
|
||||
rm_pg_osd_t *r = new rm_pg_osd_t();
|
||||
rm_pg_t *r = new rm_pg_t();
|
||||
r->pg_num = pg_item.first;
|
||||
r->osd_num = pg.cur_primary;
|
||||
r->state = RM_NO_LIST;
|
||||
r->rm_osd_num = pg.cur_primary;
|
||||
r->state = RM_LISTING;
|
||||
if (pg.cur_state != PG_ACTIVE)
|
||||
{
|
||||
std::set<osd_num_t> all_peers;
|
||||
for (osd_num_t pg_osd: pg.target_set)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (auto & hist_item: pg.target_history)
|
||||
{
|
||||
for (auto pg_osd: hist_item)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
|
||||
}
|
||||
r->to_list = r->list_osds.size();
|
||||
lists.push_back(r);
|
||||
}
|
||||
std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b)
|
||||
std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
|
||||
{
|
||||
return a->osd_num < b->osd_num ? true : false;
|
||||
return a->rm_osd_num < b->rm_osd_num ? true : false;
|
||||
});
|
||||
pgs_to_list = lists.size();
|
||||
started = true;
|
||||
@@ -160,6 +210,10 @@ public:
|
||||
|
||||
void send_list(rm_pg_osd_t *cur_list)
|
||||
{
|
||||
if (cur_list->sent)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
||||
cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
@@ -177,7 +231,7 @@ public:
|
||||
.id = cli->msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_LIST,
|
||||
},
|
||||
.list_pg = cur_list->pg_num,
|
||||
.list_pg = cur_list->pg->pg_num,
|
||||
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
|
||||
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
|
||||
.min_inode = inode,
|
||||
@@ -186,53 +240,67 @@ public:
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
pgs_to_list--;
|
||||
cur_list->pg->to_list--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n",
|
||||
cur_list->osd_num, op->reply.hdr.retval);
|
||||
cli->msgr.stop_client(cur_list->osd_num);
|
||||
delete op;
|
||||
cur_list->state = RM_END;
|
||||
continue_delete();
|
||||
return;
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
||||
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
if (log_level > 0)
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||
pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||
);
|
||||
if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
|
||||
{
|
||||
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||
printf(
|
||||
"[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
|
||||
pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
|
||||
);
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
|
||||
oid.stripe = oid.stripe & ~STRIPE_MASK;
|
||||
cur_list->pg->objects.insert(oid);
|
||||
}
|
||||
}
|
||||
cur_list->obj_list = (obj_ver_id*)op->buf;
|
||||
cur_list->obj_count = (uint64_t)op->reply.hdr.retval;
|
||||
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
|
||||
total_count += cur_list->obj_count;
|
||||
total_prev_pct = 0;
|
||||
// set op->buf to NULL so it doesn't get freed
|
||||
op->buf = NULL;
|
||||
delete op;
|
||||
cur_list->state = RM_REMOVING;
|
||||
if (cur_list->pg->to_list <= 0)
|
||||
{
|
||||
cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
|
||||
cur_list->pg->obj_pos = cur_list->pg->objects.begin();
|
||||
cur_list->pg->obj_count = cur_list->pg->objects.size();
|
||||
total_count += cur_list->pg->obj_count;
|
||||
total_prev_pct = 0;
|
||||
cur_list->pg->state = RM_REMOVING;
|
||||
pgs_to_list--;
|
||||
}
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->state = RM_LIST_SENT;
|
||||
cli->msgr.outbox_push(op);
|
||||
cur_list->sent = true;
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_osd_t *cur_list)
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
||||
if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
|
||||
cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count)
|
||||
while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
|
||||
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
@@ -240,8 +308,8 @@ public:
|
||||
.id = cli->msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_list[cur_list->obj_pos].oid.inode,
|
||||
.offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK),
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
@@ -251,7 +319,7 @@ public:
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
@@ -262,12 +330,10 @@ public:
|
||||
cur_list->obj_pos++;
|
||||
cur_list->in_flight++;
|
||||
}
|
||||
if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count)
|
||||
if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
|
||||
{
|
||||
free(cur_list->obj_list);
|
||||
cur_list->obj_list = NULL;
|
||||
cur_list->obj_count = 0;
|
||||
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
|
||||
cur_list->obj_done = cur_list->obj_prev_done = 0;
|
||||
cur_list->state = RM_END;
|
||||
}
|
||||
}
|
||||
@@ -276,6 +342,22 @@ public:
|
||||
{
|
||||
int par_osd = 0;
|
||||
osd_num_t max_seen_osd = 0;
|
||||
bool no_del = false;
|
||||
if (list_first)
|
||||
{
|
||||
int i, n = 0;
|
||||
for (i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i]->state == RM_LISTING)
|
||||
{
|
||||
n++;
|
||||
}
|
||||
}
|
||||
if (n > 0)
|
||||
{
|
||||
no_del = true;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i]->state == RM_END)
|
||||
@@ -284,18 +366,25 @@ public:
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
}
|
||||
else if (lists[i]->osd_num > max_seen_osd)
|
||||
else if (lists[i]->rm_osd_num > max_seen_osd)
|
||||
{
|
||||
if (lists[i]->state == RM_NO_LIST)
|
||||
if (lists[i]->state == RM_LISTING)
|
||||
{
|
||||
send_list(lists[i]);
|
||||
for (int j = 0; j < lists[i]->list_osds.size(); j++)
|
||||
{
|
||||
send_list(&lists[i]->list_osds[j]);
|
||||
}
|
||||
}
|
||||
else if (lists[i]->state == RM_REMOVING)
|
||||
{
|
||||
if (no_del)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
par_osd++;
|
||||
max_seen_osd = lists[i]->osd_num;
|
||||
max_seen_osd = lists[i]->rm_osd_num;
|
||||
if (par_osd >= parallel_osds)
|
||||
{
|
||||
break;
|
||||
|
257
rpm/qemu-kvm-el7.spec.patch
Normal file
257
rpm/qemu-kvm-el7.spec.patch
Normal file
@@ -0,0 +1,257 @@
|
||||
--- qemu-kvm.spec.orig 2020-11-09 23:41:03.000000000 +0000
|
||||
+++ qemu-kvm.spec 2020-12-06 10:44:24.207640963 +0000
|
||||
@@ -2,7 +2,7 @@
|
||||
%global SLOF_gittagcommit 899d9883
|
||||
|
||||
%global have_usbredir 1
|
||||
-%global have_spice 1
|
||||
+%global have_spice 0
|
||||
%global have_opengl 1
|
||||
%global have_fdt 0
|
||||
%global have_gluster 1
|
||||
@@ -56,7 +56,7 @@ Requires: %{name}-block-curl = %{epoch}:
|
||||
Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \
|
||||
%endif \
|
||||
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
|
||||
-Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
||||
+#Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
||||
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
|
||||
|
||||
# Macro to properly setup RHEL/RHEV conflict handling
|
||||
@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
|
||||
Summary: QEMU is a machine emulator and virtualizer
|
||||
Name: qemu-kvm
|
||||
Version: 4.2.0
|
||||
-Release: 29.vitastor%{?dist}.6
|
||||
+Release: 30.vitastor%{?dist}.6
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
Epoch: 15
|
||||
License: GPLv2 and GPLv2+ and CC-BY
|
||||
@@ -99,8 +99,8 @@ Source30: kvm-s390x.conf
|
||||
Source31: kvm-x86.conf
|
||||
Source32: qemu-pr-helper.service
|
||||
Source33: qemu-pr-helper.socket
|
||||
-Source34: 81-kvm-rhel.rules
|
||||
-Source35: udev-kvm-check.c
|
||||
+#Source34: 81-kvm-rhel.rules
|
||||
+#Source35: udev-kvm-check.c
|
||||
Source36: README.tests
|
||||
|
||||
|
||||
@@ -825,7 +825,9 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
|
||||
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
|
||||
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
|
||||
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
|
||||
-Patch335: qemu-4.2-vitastor.patch
|
||||
+Patch335: qemu-use-sphinx-1.2.patch
|
||||
+Patch336: qemu-config-tcmalloc-warning.patch
|
||||
+Patch337: qemu-4.2-vitastor.patch
|
||||
|
||||
BuildRequires: wget
|
||||
BuildRequires: rpm-build
|
||||
@@ -842,7 +844,8 @@ BuildRequires: pciutils-devel
|
||||
BuildRequires: libiscsi-devel
|
||||
BuildRequires: ncurses-devel
|
||||
BuildRequires: libattr-devel
|
||||
-BuildRequires: libusbx-devel >= 1.0.22
|
||||
+BuildRequires: gperftools-devel
|
||||
+BuildRequires: libusbx-devel >= 1.0.21
|
||||
%if %{have_usbredir}
|
||||
BuildRequires: usbredir-devel >= 0.7.1
|
||||
%endif
|
||||
@@ -856,12 +859,12 @@ BuildRequires: virglrenderer-devel
|
||||
# For smartcard NSS support
|
||||
BuildRequires: nss-devel
|
||||
%endif
|
||||
-BuildRequires: libseccomp-devel >= 2.4.0
|
||||
+#Requires: libseccomp >= 2.4.0
|
||||
# For network block driver
|
||||
BuildRequires: libcurl-devel
|
||||
BuildRequires: libssh-devel
|
||||
-BuildRequires: librados-devel
|
||||
-BuildRequires: librbd-devel
|
||||
+#BuildRequires: librados-devel
|
||||
+#BuildRequires: librbd-devel
|
||||
%if %{have_gluster}
|
||||
# For gluster block driver
|
||||
BuildRequires: glusterfs-api-devel
|
||||
@@ -955,25 +958,25 @@ hardware for a full system such as a PC
|
||||
|
||||
%package -n qemu-kvm-core
|
||||
Summary: qemu-kvm core components
|
||||
+Requires: gperftools-libs
|
||||
Requires: qemu-img = %{epoch}:%{version}-%{release}
|
||||
%ifarch %{ix86} x86_64
|
||||
Requires: seabios-bin >= 1.10.2-1
|
||||
Requires: sgabios-bin
|
||||
-Requires: edk2-ovmf
|
||||
%endif
|
||||
%ifarch aarch64
|
||||
Requires: edk2-aarch64
|
||||
%endif
|
||||
|
||||
%ifnarch aarch64 s390x
|
||||
-Requires: seavgabios-bin >= 1.12.0-3
|
||||
-Requires: ipxe-roms-qemu >= 20170123-1
|
||||
+Requires: seavgabios-bin >= 1.11.0-1
|
||||
+Requires: ipxe-roms-qemu >= 20181214-1
|
||||
+Requires: /usr/share/ipxe.efi
|
||||
%endif
|
||||
%ifarch %{power64}
|
||||
Requires: SLOF >= %{SLOF_gittagdate}-1.git%{SLOF_gittagcommit}
|
||||
%endif
|
||||
Requires: %{name}-common = %{epoch}:%{version}-%{release}
|
||||
-Requires: libseccomp >= 2.4.0
|
||||
# For compressed guest memory dumps
|
||||
Requires: lzo snappy
|
||||
%if %{have_kvm_setup}
|
||||
@@ -1085,15 +1088,15 @@ This package provides the additional iSC
|
||||
Install this package if you want to access iSCSI volumes.
|
||||
|
||||
|
||||
-%package block-rbd
|
||||
-Summary: QEMU Ceph/RBD block driver
|
||||
-Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
-
|
||||
-%description block-rbd
|
||||
-This package provides the additional Ceph/RBD block driver for QEMU.
|
||||
-
|
||||
-Install this package if you want to access remote Ceph volumes
|
||||
-using the rbd protocol.
|
||||
+#%package block-rbd
|
||||
+#Summary: QEMU Ceph/RBD block driver
|
||||
+#Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
+#
|
||||
+#%description block-rbd
|
||||
+#This package provides the additional Ceph/RBD block driver for QEMU.
|
||||
+#
|
||||
+#Install this package if you want to access remote Ceph volumes
|
||||
+#using the rbd protocol.
|
||||
|
||||
|
||||
%package block-ssh
|
||||
@@ -1117,12 +1120,14 @@ the Secure Shell (SSH) protocol.
|
||||
# --build-id option is used for giving info to the debug packages.
|
||||
buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
|
||||
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
|
||||
+#%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
|
||||
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,blkdebug,luks,null-co,nvme,copy-on-read,throttle
|
||||
|
||||
%if 0%{have_gluster}
|
||||
%global block_drivers_list %{block_drivers_list},gluster
|
||||
%endif
|
||||
|
||||
+[ -e /usr/bin/sphinx-build ] || ln -s sphinx-build-3 /usr/bin/sphinx-build
|
||||
./configure \
|
||||
--prefix="%{_prefix}" \
|
||||
--libdir="%{_libdir}" \
|
||||
@@ -1152,15 +1157,15 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
%else
|
||||
--disable-numa \
|
||||
%endif
|
||||
- --enable-rbd \
|
||||
+ --disable-rbd \
|
||||
%if 0%{have_librdma}
|
||||
--enable-rdma \
|
||||
%else
|
||||
--disable-rdma \
|
||||
%endif
|
||||
--disable-pvrdma \
|
||||
- --enable-seccomp \
|
||||
-%if 0%{have_spice}
|
||||
+ --disable-seccomp \
|
||||
+%if %{have_spice}
|
||||
--enable-spice \
|
||||
--enable-smartcard \
|
||||
--enable-virglrenderer \
|
||||
@@ -1179,7 +1184,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
%else
|
||||
--disable-usb-redir \
|
||||
%endif
|
||||
- --disable-tcmalloc \
|
||||
+ --enable-tcmalloc \
|
||||
%ifarch x86_64
|
||||
--enable-libpmem \
|
||||
%else
|
||||
@@ -1193,9 +1198,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
%endif
|
||||
--python=%{__python3} \
|
||||
--target-list="%{buildarch}" \
|
||||
- --block-drv-rw-whitelist=%{block_drivers_list} \
|
||||
--audio-drv-list= \
|
||||
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
|
||||
--with-coroutine=ucontext \
|
||||
--tls-priority=NORMAL \
|
||||
--disable-bluez \
|
||||
@@ -1262,7 +1265,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
--disable-sanitizers \
|
||||
--disable-hvf \
|
||||
--disable-whpx \
|
||||
- --enable-malloc-trim \
|
||||
+ --disable-malloc-trim \
|
||||
--disable-membarrier \
|
||||
--disable-vhost-crypto \
|
||||
--disable-libxml2 \
|
||||
@@ -1308,7 +1311,7 @@ make V=1 %{?_smp_mflags} $buildldflags
|
||||
cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm
|
||||
|
||||
gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl
|
||||
-gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
|
||||
+#gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
|
||||
|
||||
%install
|
||||
%define _udevdir %(pkg-config --variable=udevdir udev)
|
||||
@@ -1343,8 +1346,8 @@ mkdir -p $RPM_BUILD_ROOT%{testsdir}/test
|
||||
mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests
|
||||
mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
|
||||
|
||||
-install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
|
||||
-install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
|
||||
+#install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
|
||||
+#install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
|
||||
|
||||
install -m 0644 scripts/dump-guest-memory.py \
|
||||
$RPM_BUILD_ROOT%{_datadir}/%{name}
|
||||
@@ -1562,6 +1565,8 @@ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/inte
|
||||
# Remove spec
|
||||
rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs
|
||||
|
||||
+%global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g')
|
||||
+
|
||||
%check
|
||||
export DIFF=diff; make check V=1
|
||||
|
||||
@@ -1645,8 +1650,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
||||
%config(noreplace) %{_sysconfdir}/sysconfig/ksm
|
||||
%{_unitdir}/ksmtuned.service
|
||||
%{_sbindir}/ksmtuned
|
||||
-%{_udevdir}/udev-kvm-check
|
||||
-%{_udevrulesdir}/81-kvm-rhel.rules
|
||||
+#%{_udevdir}/udev-kvm-check
|
||||
+#%{_udevrulesdir}/81-kvm-rhel.rules
|
||||
%ghost %{_sysconfdir}/kvm
|
||||
%config(noreplace) %{_sysconfdir}/ksmtuned.conf
|
||||
%dir %{_sysconfdir}/%{name}
|
||||
@@ -1711,8 +1716,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
||||
%{_libexecdir}/vhost-user-gpu
|
||||
%{_datadir}/%{name}/vhost-user/50-qemu-gpu.json
|
||||
%endif
|
||||
-%{_libexecdir}/virtiofsd
|
||||
-%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
|
||||
+#%{_libexecdir}/virtiofsd
|
||||
+#%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
|
||||
|
||||
%files -n qemu-img
|
||||
%defattr(-,root,root)
|
||||
@@ -1748,8 +1753,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
||||
%files block-iscsi
|
||||
%{_libdir}/qemu-kvm/block-iscsi.so
|
||||
|
||||
-%files block-rbd
|
||||
-%{_libdir}/qemu-kvm/block-rbd.so
|
||||
+#%files block-rbd
|
||||
+#%{_libdir}/qemu-kvm/block-rbd.so
|
||||
|
||||
%files block-ssh
|
||||
%{_libdir}/qemu-kvm/block-ssh.so
|
@@ -20,6 +20,7 @@ Requires: rh-nodejs12
|
||||
Requires: rh-nodejs12-npm
|
||||
Requires: liburing >= 0.6
|
||||
Requires: libJerasure2
|
||||
Requires: lpsolve
|
||||
|
||||
%description
|
||||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
|
||||
|
@@ -18,6 +18,7 @@ Requires: qemu-kvm = 4.2.0-29.el8.6
|
||||
Requires: nodejs >= 10
|
||||
Requires: liburing >= 0.6
|
||||
Requires: libJerasure2
|
||||
Requires: lpsolve
|
||||
|
||||
%description
|
||||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
|
||||
|
Reference in New Issue
Block a user