Compare commits
6 Commits
v0.7.1
...
lrc-matrix
Author | SHA1 | Date | |
---|---|---|---|
42ae8f91ee | |||
dac12d8a4c | |||
1eec4407ab | |||
![]() |
3b7c6dcac2 | ||
342517d126 | |||
675bc12a13 |
1
debian/vitastor-osd.install
vendored
1
debian/vitastor-osd.install
vendored
@@ -1,2 +1,3 @@
|
|||||||
usr/bin/vitastor-osd
|
usr/bin/vitastor-osd
|
||||||
usr/bin/vitastor-dump-journal
|
usr/bin/vitastor-dump-journal
|
||||||
|
usr/bin/vitastor-dump-meta
|
||||||
|
@@ -110,7 +110,7 @@
|
|||||||
|
|
||||||
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
|
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
|
||||||
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
|
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
|
||||||
основе библиотек ISA-L или jerasure. Быстрая реализацяю на основе ISA-L
|
основе библиотек ISA-L или jerasure. Быстрая реализация на основе ISA-L
|
||||||
используется автоматически, когда доступна, в противном случае используется
|
используется автоматически, когда доступна, в противном случае используется
|
||||||
более медленная jerasure-версия.
|
более медленная jerasure-версия.
|
||||||
|
|
||||||
|
@@ -464,7 +464,7 @@ class VitastorDriver(driver.CloneableImageVD,
|
|||||||
vol_name = utils.convert_str(volume.name)
|
vol_name = utils.convert_str(volume.name)
|
||||||
snap_name = utils.convert_str(snapshot.name)
|
snap_name = utils.convert_str(snapshot.name)
|
||||||
|
|
||||||
snap = self._get_image(vol_name+'@'+snap_name)
|
snap = self._get_image('volume-'+snapshot.volume_id+'@'+snap_name)
|
||||||
if not snap:
|
if not snap:
|
||||||
raise exception.SnapshotNotFound(snapshot_id = snap_name)
|
raise exception.SnapshotNotFound(snapshot_id = snap_name)
|
||||||
snap_inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
|
snap_inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
|
||||||
|
@@ -113,6 +113,7 @@ cp -r mon %buildroot/usr/lib/vitastor
|
|||||||
%files -n vitastor-osd
|
%files -n vitastor-osd
|
||||||
%_bindir/vitastor-osd
|
%_bindir/vitastor-osd
|
||||||
%_bindir/vitastor-dump-journal
|
%_bindir/vitastor-dump-journal
|
||||||
|
%_bindir/vitastor-dump-meta
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-mon
|
%files -n vitastor-mon
|
||||||
|
@@ -110,6 +110,7 @@ cp -r mon %buildroot/usr/lib/vitastor
|
|||||||
%files -n vitastor-osd
|
%files -n vitastor-osd
|
||||||
%_bindir/vitastor-osd
|
%_bindir/vitastor-osd
|
||||||
%_bindir/vitastor-dump-journal
|
%_bindir/vitastor-dump-journal
|
||||||
|
%_bindir/vitastor-dump-meta
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-mon
|
%files -n vitastor-mon
|
||||||
|
@@ -198,6 +198,11 @@ add_executable(vitastor-dump-journal
|
|||||||
dump_journal.cpp crc32c.c
|
dump_journal.cpp crc32c.c
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# vitastor-dump-meta
|
||||||
|
add_executable(vitastor-dump-meta
|
||||||
|
dump_meta.cpp
|
||||||
|
)
|
||||||
|
|
||||||
if (${WITH_QEMU})
|
if (${WITH_QEMU})
|
||||||
# qemu_driver.so
|
# qemu_driver.so
|
||||||
add_library(qemu_vitastor SHARED
|
add_library(qemu_vitastor SHARED
|
||||||
@@ -275,7 +280,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
|
|||||||
|
|
||||||
### Install
|
### Install
|
||||||
|
|
||||||
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-dump-meta vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
|
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
|
||||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
|
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
|
||||||
install(
|
install(
|
||||||
|
@@ -19,7 +19,11 @@
|
|||||||
#include "timerfd_manager.h"
|
#include "timerfd_manager.h"
|
||||||
|
|
||||||
// Memory alignment for direct I/O (usually 512 bytes)
|
// Memory alignment for direct I/O (usually 512 bytes)
|
||||||
// All other alignments must be a multiple of this one
|
#ifndef DIRECT_IO_ALIGNMENT
|
||||||
|
#define DIRECT_IO_ALIGNMENT 512
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Memory allocation alignment (page size is usually optimal)
|
||||||
#ifndef MEM_ALIGNMENT
|
#ifndef MEM_ALIGNMENT
|
||||||
#define MEM_ALIGNMENT 4096
|
#define MEM_ALIGNMENT 4096
|
||||||
#endif
|
#endif
|
||||||
|
@@ -109,25 +109,25 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
|||||||
{
|
{
|
||||||
disk_alignment = 4096;
|
disk_alignment = 4096;
|
||||||
}
|
}
|
||||||
else if (disk_alignment % MEM_ALIGNMENT)
|
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
|
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||||
}
|
}
|
||||||
if (!journal_block_size)
|
if (!journal_block_size)
|
||||||
{
|
{
|
||||||
journal_block_size = 4096;
|
journal_block_size = 4096;
|
||||||
}
|
}
|
||||||
else if (journal_block_size % MEM_ALIGNMENT)
|
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
|
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||||
}
|
}
|
||||||
if (!meta_block_size)
|
if (!meta_block_size)
|
||||||
{
|
{
|
||||||
meta_block_size = 4096;
|
meta_block_size = 4096;
|
||||||
}
|
}
|
||||||
else if (meta_block_size % MEM_ALIGNMENT)
|
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
|
||||||
{
|
{
|
||||||
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
|
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
|
||||||
}
|
}
|
||||||
if (data_offset % disk_alignment)
|
if (data_offset % disk_alignment)
|
||||||
{
|
{
|
||||||
|
@@ -276,7 +276,8 @@ resume_4:
|
|||||||
new_id = 1+INODE_NO_POOL(kv.value.uint64_value());
|
new_id = 1+INODE_NO_POOL(kv.value.uint64_value());
|
||||||
max_id_mod_rev = kv.mod_revision;
|
max_id_mod_rev = kv.mod_revision;
|
||||||
}
|
}
|
||||||
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id, 0));
|
// Also check existing inodes - for the case when some inodes are created without changing /index/maxid
|
||||||
|
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id+1, 0));
|
||||||
if (ino_it != parent->cli->st_cli.inode_config.begin())
|
if (ino_it != parent->cli->st_cli.inode_config.begin())
|
||||||
{
|
{
|
||||||
ino_it--;
|
ino_it--;
|
||||||
|
@@ -14,6 +14,11 @@
|
|||||||
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
|
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
|
||||||
{
|
{
|
||||||
std::string device = cfg["device"].string_value();
|
std::string device = cfg["device"].string_value();
|
||||||
|
if (device == "")
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Device path is missing\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
uint64_t object_size = parse_size(cfg["object_size"].string_value());
|
uint64_t object_size = parse_size(cfg["object_size"].string_value());
|
||||||
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
|
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
|
||||||
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());
|
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());
|
||||||
|
@@ -52,7 +52,7 @@ int main(int argc, char *argv[])
|
|||||||
self.journal_block = strtoul(argv[b+1], NULL, 10);
|
self.journal_block = strtoul(argv[b+1], NULL, 10);
|
||||||
self.journal_offset = strtoull(argv[b+2], NULL, 10);
|
self.journal_offset = strtoull(argv[b+2], NULL, 10);
|
||||||
self.journal_len = strtoull(argv[b+3], NULL, 10);
|
self.journal_len = strtoull(argv[b+3], NULL, 10);
|
||||||
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
|
if (self.journal_block < DIRECT_IO_ALIGNMENT || (self.journal_block % DIRECT_IO_ALIGNMENT) ||
|
||||||
self.journal_block > 128*1024)
|
self.journal_block > 128*1024)
|
||||||
{
|
{
|
||||||
printf("Invalid journal block size\n");
|
printf("Invalid journal block size\n");
|
||||||
|
173
src/dump_meta.cpp
Normal file
173
src/dump_meta.cpp
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
|
#define _LARGEFILE64_SOURCE
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <malloc.h>
|
||||||
|
#include <linux/fs.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "object_id.h"
|
||||||
|
#include "osd_id.h"
|
||||||
|
|
||||||
|
// "VITAstor"
|
||||||
|
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
|
||||||
|
#define BLOCKSTORE_META_VERSION_V1 1
|
||||||
|
|
||||||
|
#define DIRECT_IO_ALIGNMENT 512
|
||||||
|
#define MEM_ALIGNMENT 4096
|
||||||
|
|
||||||
|
struct __attribute__((__packed__)) clean_disk_entry_v0_t
|
||||||
|
{
|
||||||
|
object_id oid;
|
||||||
|
uint64_t version;
|
||||||
|
uint8_t bitmap[];
|
||||||
|
};
|
||||||
|
|
||||||
|
// Superblock stored at the very beginning of the metadata area.
// The dumper treats the area as the 0.6+ format only when `zero` is 0 and
// `magic`/`version` match BLOCKSTORE_META_MAGIC_V1 / BLOCKSTORE_META_VERSION_V1.
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
    // Must be 0 for the header to be recognized
    uint64_t zero;
    // Format signature, compared against BLOCKSTORE_META_MAGIC_V1
    uint64_t magic;
    // Format version, compared against BLOCKSTORE_META_VERSION_V1
    uint64_t version;
    // Metadata block size in bytes; overrides the command line value
    uint32_t meta_block_size;
    // Data block size in bytes; used to derive the per-entry bitmap size
    uint32_t data_block_size;
    // Bytes of data covered by a single bitmap bit
    uint32_t bitmap_granularity;
};
|
||||||
|
|
||||||
|
// Run-time state of the metadata dump tool
struct meta_dumper_t
{
    // Path of the metadata device/file (taken from the command line)
    char *meta_device;
    // Metadata block size in bytes
    uint32_t meta_block_size;
    // Byte offset of the metadata area inside the device
    uint64_t meta_offset;
    // Length of the metadata area in bytes
    uint64_t meta_len;
    // Number of bytes of the metadata area read so far
    uint64_t meta_pos;
    // File descriptor of the opened device
    int fd;
};
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
meta_dumper_t self = { 0 };
|
||||||
|
int b = 1;
|
||||||
|
if (argc < b+4)
|
||||||
|
{
|
||||||
|
printf("USAGE: %s <meta_file> <meta_block_size> <offset> <size>\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
self.meta_device = argv[b];
|
||||||
|
self.meta_block_size = strtoul(argv[b+1], NULL, 10);
|
||||||
|
self.meta_offset = strtoull(argv[b+2], NULL, 10);
|
||||||
|
self.meta_len = strtoull(argv[b+3], NULL, 10);
|
||||||
|
if (self.meta_block_size % DIRECT_IO_ALIGNMENT)
|
||||||
|
{
|
||||||
|
printf("Invalid metadata block size\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
self.fd = open(self.meta_device, O_DIRECT|O_RDONLY);
|
||||||
|
if (self.fd == -1)
|
||||||
|
{
|
||||||
|
printf("Failed to open metadata device\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
// Read all metadata into memory
|
||||||
|
void *data = memalign(MEM_ALIGNMENT, self.meta_len);
|
||||||
|
if (!data)
|
||||||
|
{
|
||||||
|
printf("Failed to allocate %lu MB of memory\n", self.meta_len/1024/1024);
|
||||||
|
close(self.fd);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
while (self.meta_pos < self.meta_len)
|
||||||
|
{
|
||||||
|
int r = pread(self.fd, data+self.meta_pos, self.meta_len-self.meta_pos, self.meta_offset+self.meta_pos);
|
||||||
|
assert(r > 0);
|
||||||
|
self.meta_pos += r;
|
||||||
|
}
|
||||||
|
close(self.fd);
|
||||||
|
// Check superblock
|
||||||
|
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||||
|
if (hdr->zero == 0 &&
|
||||||
|
hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
|
||||||
|
hdr->version == BLOCKSTORE_META_VERSION_V1)
|
||||||
|
{
|
||||||
|
// Vitastor 0.6-0.7 - static array of clean_disk_entry_v0_t with bitmaps
|
||||||
|
if (hdr->meta_block_size != self.meta_block_size)
|
||||||
|
{
|
||||||
|
printf("Using block size %u bytes based on information from the superblock\n", hdr->meta_block_size);
|
||||||
|
self.meta_block_size = hdr->meta_block_size;
|
||||||
|
}
|
||||||
|
uint64_t clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8;
|
||||||
|
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t) + 2*clean_entry_bitmap_size;
|
||||||
|
uint64_t block_num = 0;
|
||||||
|
printf(
|
||||||
|
"{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
|
||||||
|
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
|
||||||
|
);
|
||||||
|
bool first = true;
|
||||||
|
for (uint64_t meta_pos = self.meta_block_size; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
|
||||||
|
{
|
||||||
|
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
|
||||||
|
{
|
||||||
|
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
|
||||||
|
if (entry->oid.inode)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu,\"bitmap\":\""
|
||||||
|
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
|
||||||
|
#undef ENTRY_FMT
|
||||||
|
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
|
||||||
|
entry->oid.stripe, entry->version
|
||||||
|
);
|
||||||
|
first = false;
|
||||||
|
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
|
||||||
|
{
|
||||||
|
printf("%02x", entry->bitmap[i]);
|
||||||
|
}
|
||||||
|
printf("\",\"ext_bitmap\":\"");
|
||||||
|
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
|
||||||
|
{
|
||||||
|
printf("%02x", entry->bitmap[clean_entry_bitmap_size + i]);
|
||||||
|
}
|
||||||
|
printf("\"}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("]}\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Vitastor 0.4-0.5 - static array of clean_disk_entry_v0_t
|
||||||
|
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t);
|
||||||
|
uint64_t block_num = 0;
|
||||||
|
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", self.meta_block_size);
|
||||||
|
bool first = true;
|
||||||
|
for (uint64_t meta_pos = 0; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
|
||||||
|
{
|
||||||
|
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
|
||||||
|
{
|
||||||
|
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
|
||||||
|
if (entry->oid.inode)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu}"
|
||||||
|
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
|
||||||
|
#undef ENTRY_FMT
|
||||||
|
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
|
||||||
|
entry->oid.stripe, entry->version
|
||||||
|
);
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("]}\n");
|
||||||
|
}
|
||||||
|
free(data);
|
||||||
|
}
|
2
src/lrc/Makefile
Normal file
2
src/lrc/Makefile
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
mat: mat.c
|
||||||
|
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure
|
291
src/lrc/mat.c
Normal file
291
src/lrc/mat.c
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
#include <jerasure/reed_sol.h>
|
||||||
|
#include <jerasure.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns
// w should be >= log2(data_drives + groups*local + global), but not necessarily 8/16/32
//
// Layout of the result (row-major, data_drives ints per row):
// - rows [0, groups*local): local parity rows. Row (gr*local+l) is row l of the
//   Vandermonde coding matrix with all columns outside of group gr zeroed.
//   Column j belongs to group j / (data_drives/groups), so when data_drives is not
//   divisible by groups the trailing columns belong to no group.
// - rows [groups*local, groups*local+global): global parity rows, copied from
//   rows [local, local+global) of the Vandermonde coding matrix.
//
// Returns a malloc()ed matrix which the caller must free(), or NULL when the
// parameters are invalid or an allocation fails.
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
    // groups > data_drives would make the group size 0 and divide by zero below
    if (data_drives <= 0 || groups <= 0 || groups > data_drives || local < 0 || global < 0)
    {
        return NULL;
    }
    // Compare in 64 bits: (1<<w) overflows int for w >= 31
    if (w < 0 || w > 32 || (long long)data_drives + (long long)groups*local + global > (1LL << w))
    {
        return NULL;
    }
    int code_rows = groups*local + global;
    int group_size = data_drives/groups;
    // The matrix has code_rows ROWS of data_drives ints each
    int *lrc_matrix = (int*)malloc(sizeof(int) * code_rows * data_drives);
    if (!lrc_matrix)
    {
        return NULL;
    }
    int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
    if (!matrix)
    {
        free(lrc_matrix);
        return NULL;
    }
    // Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
    //for (int i = 0; i < local+global; i++)
    //{
    //    int t = matrix[i*data_drives + 3];
    //    matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
    //    matrix[i*data_drives + 7] = t;
    //}
    for (int gr = 0; gr < groups; gr++)
    {
        for (int l = 0; l < local; l++)
        {
            for (int j = 0; j < data_drives; j++)
            {
                lrc_matrix[(gr*local+l)*data_drives + j] = (j / group_size) == gr ? matrix[l*data_drives + j] : 0;
            }
        }
    }
    for (int i = 0; i < global; i++)
    {
        for (int j = 0; j < data_drives; j++)
        {
            lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
        }
    }
    free(matrix);
    return lrc_matrix;
}
|
||||||
|
|
||||||
|
// Counters returned by check_mr_lrc()
struct lrc_test_result_t
{
    // Erasure patterns whose matrix was inverted successfully
    int success;
    // Erasure patterns classified as not recoverable by the counting check
    int impossible;
    // Erasure patterns expected to be recoverable whose matrix could not be inverted
    int failures;
};
|
||||||
|
|
||||||
|
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
|
||||||
|
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
|
||||||
|
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
|
||||||
|
{
|
||||||
|
int n = data_drives;
|
||||||
|
int total_rows = n + groups*local + global;
|
||||||
|
int impossible = 0, success = 0, failures = 0;
|
||||||
|
int *lost_per_group = (int*)malloc(sizeof(int) * groups);
|
||||||
|
int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
|
||||||
|
int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
|
||||||
|
// global+1 is always recoverable
|
||||||
|
for (int lost = global+2; lost <= groups*local+global; lost++)
|
||||||
|
{
|
||||||
|
int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
|
||||||
|
int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
|
||||||
|
int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
p[i] = i;
|
||||||
|
int *p2 = (int*)malloc(sizeof(int) * n);
|
||||||
|
if (total_rows-lost > n)
|
||||||
|
{
|
||||||
|
p[n-1] = n; // skip combinations with all N data disks (0..n-1)
|
||||||
|
for (int i = n; i < total_rows-lost; i++)
|
||||||
|
p[i] = i+1;
|
||||||
|
p[total_rows-lost-1]--; // will be incremented on the first step
|
||||||
|
}
|
||||||
|
int inc = total_rows-lost-1;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
p[inc]++;
|
||||||
|
if (p[inc] >= n+groups*local+global)
|
||||||
|
{
|
||||||
|
if (inc == 0)
|
||||||
|
break;
|
||||||
|
inc--;
|
||||||
|
}
|
||||||
|
else if (inc+1 < total_rows-lost)
|
||||||
|
{
|
||||||
|
p[inc+1] = p[inc];
|
||||||
|
inc++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Check if it should be recoverable
|
||||||
|
// Calculate count of data chunks lost in each group
|
||||||
|
int nsel = 0;
|
||||||
|
for (int gr = 0; gr < groups; gr++)
|
||||||
|
{
|
||||||
|
lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
|
||||||
|
recovered_per_group[gr] = 0;
|
||||||
|
}
|
||||||
|
for (int j = 0; j < total_rows-lost; j++)
|
||||||
|
{
|
||||||
|
if (p[j] < n)
|
||||||
|
{
|
||||||
|
lost_per_group[(p[j] / (n/groups))]--;
|
||||||
|
selected_inverted[nsel++] = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Every local parity chunk is supposed to restore 1 missing chunk inside its group
|
||||||
|
// So, subtract local parity chunk counts from each group lost chunk count
|
||||||
|
for (int j = 0; j < total_rows-lost; j++)
|
||||||
|
{
|
||||||
|
if (p[j] >= n && p[j] < n+groups*local)
|
||||||
|
{
|
||||||
|
int gr = (p[j]-n)/local;
|
||||||
|
if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
|
||||||
|
{
|
||||||
|
selected_inverted[nsel++] = j;
|
||||||
|
}
|
||||||
|
recovered_per_group[gr]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Every global parity chunk is supposed to restore 1 chunk of all that are still missing
|
||||||
|
int still_missing = 0;
|
||||||
|
for (int gr = 0; gr < groups; gr++)
|
||||||
|
{
|
||||||
|
int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
|
||||||
|
still_missing += (non_fixed > 0 ? non_fixed : 0);
|
||||||
|
}
|
||||||
|
for (int j = 0; j < total_rows-lost; j++)
|
||||||
|
{
|
||||||
|
if (p[j] >= n+groups*local)
|
||||||
|
{
|
||||||
|
if (still_missing > 0 && nsel < n)
|
||||||
|
{
|
||||||
|
selected_inverted[nsel++] = j;
|
||||||
|
}
|
||||||
|
still_missing--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (still_missing <= 0)
|
||||||
|
{
|
||||||
|
// We hope it can be recoverable. Try to invert it
|
||||||
|
assert(nsel == n);
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
|
||||||
|
if (invert_ok < 0)
|
||||||
|
{
|
||||||
|
failures++;
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
printf("\nFAIL: ");
|
||||||
|
for (int i = 0; i < total_rows-lost; i++)
|
||||||
|
{
|
||||||
|
printf("%d ", p[i]);
|
||||||
|
}
|
||||||
|
printf("\nDIRECT:\n");
|
||||||
|
for (int i = 0; i < total_rows-lost; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
printf("%d ", lrc_matrix[p[i]*n+j]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
printf("INVERSE:\n");
|
||||||
|
for (int i = 0; i < total_rows-lost; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
printf("%d ", inverted_matrix[i*n+j]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
success++;
|
||||||
|
if (log_level > 2)
|
||||||
|
{
|
||||||
|
printf("OK: ");
|
||||||
|
for (int i = 0; i < total_rows-lost; i++)
|
||||||
|
{
|
||||||
|
printf("%d ", p[i]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
impossible++;
|
||||||
|
if (log_level > 1)
|
||||||
|
{
|
||||||
|
printf("IMPOSSIBLE: ");
|
||||||
|
for (int i = 0; i < total_rows-lost; i++)
|
||||||
|
{
|
||||||
|
printf("%d ", p[i]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(p2);
|
||||||
|
free(p);
|
||||||
|
free(inverted_matrix);
|
||||||
|
free(erased_matrix);
|
||||||
|
}
|
||||||
|
free(lost_per_group);
|
||||||
|
free(recovered_per_group);
|
||||||
|
return (struct lrc_test_result_t){
|
||||||
|
.success = success,
|
||||||
|
.impossible = impossible,
|
||||||
|
.failures = failures,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
int W = 8, MATRIX_W = 8;
|
||||||
|
int n = 8, groups = 2, local = 1, global = 2;
|
||||||
|
//n = 4, groups = 2, local = 1, global = 1;
|
||||||
|
int total_rows = n+groups*local+global;
|
||||||
|
int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
|
||||||
|
int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
|
||||||
|
// Fill identity+LRC matrix
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
lrc_matrix[i*n + j] = j == i ? 1 : 0;
|
||||||
|
memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
|
||||||
|
free(matrix);
|
||||||
|
matrix = NULL;
|
||||||
|
// Print LRC matrix
|
||||||
|
for (int i = 0; i < total_rows; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
printf("%d ", lrc_matrix[i*n+j]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
|
||||||
|
printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1 1 1 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 1 1 1
|
||||||
|
// 1 55 39 73 84 181 225 217
|
||||||
|
// 1 172 70 235 143 34 200 101
|
||||||
|
//
|
||||||
|
// Can't recover
|
||||||
|
// 1 2 4 5 8 9 10 11 -1
|
||||||
|
// 2 3 4 6 8 9 10 11 -1
|
||||||
|
// FULL:
|
||||||
|
// 1 0 0 0 0 0 0 0
|
||||||
|
// 0 1 0 0 0 0 0 0
|
||||||
|
// 0 0 1 0 0 0 0 0
|
||||||
|
// 0 0 0 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 0 0 0
|
||||||
|
// 0 0 0 0 0 1 0 0
|
||||||
|
// 0 0 0 0 0 0 1 0
|
||||||
|
// 0 0 0 0 0 0 0 1
|
||||||
|
// 1 1 1 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 1 1 1
|
||||||
|
// 1 55 39 73 84 181 225 217
|
||||||
|
// 1 172 70 235 143 34 200 101
|
||||||
|
// FIRST UNRECOVERABLE:
|
||||||
|
// 0 1 0 0 0 0 0 0
|
||||||
|
// 0 0 1 0 0 0 0 0
|
||||||
|
// 0 0 0 0 1 0 0 0
|
||||||
|
// 0 0 0 0 0 1 0 0
|
||||||
|
// 1 1 1 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 1 1 1
|
||||||
|
// 1 55 39 73 84 181 225 217
|
||||||
|
// 1 172 70 235 143 34 200 101
|
||||||
|
// SECOND UNRECOVERABLE:
|
||||||
|
// 0 0 1 0 0 0 0 0
|
||||||
|
// 0 0 0 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 0 0 0
|
||||||
|
// 0 0 0 0 0 0 1 0
|
||||||
|
// 1 1 1 1 0 0 0 0
|
||||||
|
// 0 0 0 0 1 1 1 1
|
||||||
|
// 1 55 39 73 84 181 225 217
|
||||||
|
// 1 172 70 235 143 34 200 101
|
||||||
|
// Ho ho ho
|
@@ -30,13 +30,19 @@
|
|||||||
#define OSD_OP_PING 15
|
#define OSD_OP_PING 15
|
||||||
#define OSD_OP_SEC_READ_BMP 16
|
#define OSD_OP_SEC_READ_BMP 16
|
||||||
#define OSD_OP_MAX 16
|
#define OSD_OP_MAX 16
|
||||||
// Alignment & limit for read/write operations
|
|
||||||
#ifndef MEM_ALIGNMENT
|
|
||||||
#define MEM_ALIGNMENT 512
|
|
||||||
#endif
|
|
||||||
#define OSD_RW_MAX 64*1024*1024
|
#define OSD_RW_MAX 64*1024*1024
|
||||||
#define OSD_PROTOCOL_VERSION 1
|
#define OSD_PROTOCOL_VERSION 1
|
||||||
|
|
||||||
|
// Memory alignment for direct I/O (usually 512 bytes)
|
||||||
|
#ifndef DIRECT_IO_ALIGNMENT
|
||||||
|
#define DIRECT_IO_ALIGNMENT 512
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Memory allocation alignment (page size is usually optimal)
|
||||||
|
#ifndef MEM_ALIGNMENT
|
||||||
|
#define MEM_ALIGNMENT 4096
|
||||||
|
#endif
|
||||||
|
|
||||||
// common request and reply headers
|
// common request and reply headers
|
||||||
struct __attribute__((__packed__)) osd_op_header_t
|
struct __attribute__((__packed__)) osd_op_header_t
|
||||||
{
|
{
|
||||||
|
@@ -4,11 +4,13 @@
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
extern "C" {
|
||||||
#include <reed_sol.h>
|
#include <reed_sol.h>
|
||||||
#include <jerasure.h>
|
#include <jerasure.h>
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
#include <isa-l/erasure_code.h>
|
#include <isa-l/erasure_code.h>
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
#include <map>
|
#include <map>
|
||||||
#include "allocator.h"
|
#include "allocator.h"
|
||||||
#include "xor.h"
|
#include "xor.h"
|
||||||
|
@@ -7,6 +7,7 @@
|
|||||||
#include "object_id.h"
|
#include "object_id.h"
|
||||||
#include "osd_id.h"
|
#include "osd_id.h"
|
||||||
|
|
||||||
|
// Memory allocation alignment (page size is usually optimal)
|
||||||
#ifndef MEM_ALIGNMENT
|
#ifndef MEM_ALIGNMENT
|
||||||
#define MEM_ALIGNMENT 4096
|
#define MEM_ALIGNMENT 4096
|
||||||
#endif
|
#endif
|
||||||
|
@@ -12,6 +12,8 @@ SCHEME=ec ./test_change_pg_count.sh
|
|||||||
|
|
||||||
./test_change_pg_size.sh
|
./test_change_pg_size.sh
|
||||||
|
|
||||||
|
./test_create_nomaxid.sh
|
||||||
|
|
||||||
./test_etcd_fail.sh
|
./test_etcd_fail.sh
|
||||||
|
|
||||||
./test_failure_domain.sh
|
./test_failure_domain.sh
|
||||||
|
21
tests/test_create_nomaxid.sh
Executable file
21
tests/test_create_nomaxid.sh
Executable file
@@ -0,0 +1,21 @@
|
|||||||
|
#!/bin/bash -ex

# Test vitastor-cli create when /index/maxid is out of sync

# Quoted so that the test also works from a checkout path containing spaces
. "$(dirname "$0")"/run_3osds.sh

# Create inode 120 in pool 1 directly in etcd, without touching /index/maxid
$ETCDCTL put /vitastor/config/inode/1/120 '{"name":"testimg","size":'$((1024*1024*1024))'}'

build/src/vitastor-cli create --etcd_address $ETCD_URL -s 1G testimg2

# The new image must get the next free inode number after the existing one
t=$($ETCDCTL get --print-value-only /vitastor/config/inode/1/121 | jq -r .name)
if [[ "$t" != "testimg2" ]]; then
    format_error "testimg2 should've been created as inode 121"
fi

# ...and /index/maxid must be brought back in sync
t=$($ETCDCTL get --print-value-only /vitastor/index/maxid/1)
if [[ "$t" != 121 ]]; then
    format_error "/index/maxid should've been set to 121"
fi

format_green OK
|
Reference in New Issue
Block a user