Compare commits

...

6 Commits

19 changed files with 532 additions and 16 deletions

View File

@@ -1,2 +1,3 @@
usr/bin/vitastor-osd
usr/bin/vitastor-dump-journal
usr/bin/vitastor-dump-meta

View File

@@ -110,7 +110,7 @@
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
основе библиотек ISA-L или jerasure. Быстрая реализацяю на основе ISA-L
основе библиотек ISA-L или jerasure. Быстрая реализация на основе ISA-L
используется автоматически, когда доступна, в противном случае используется
более медленная jerasure-версия.

View File

@@ -464,7 +464,7 @@ class VitastorDriver(driver.CloneableImageVD,
vol_name = utils.convert_str(volume.name)
snap_name = utils.convert_str(snapshot.name)
snap = self._get_image(vol_name+'@'+snap_name)
snap = self._get_image('volume-'+snapshot.volume_id+'@'+snap_name)
if not snap:
raise exception.SnapshotNotFound(snapshot_id = snap_name)
snap_inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])

View File

@@ -113,6 +113,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

View File

@@ -110,6 +110,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

View File

@@ -198,6 +198,11 @@ add_executable(vitastor-dump-journal
dump_journal.cpp crc32c.c
)
# vitastor-dump-meta
add_executable(vitastor-dump-meta
dump_meta.cpp
)
if (${WITH_QEMU})
# qemu_driver.so
add_library(qemu_vitastor SHARED
@@ -275,7 +280,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
### Install
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-dump-meta vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install(

View File

@@ -19,7 +19,11 @@
#include "timerfd_manager.h"
// Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one
#ifndef DIRECT_IO_ALIGNMENT
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif

View File

@@ -109,25 +109,25 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
disk_alignment = 4096;
}
else if (disk_alignment % MEM_ALIGNMENT)
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!journal_block_size)
{
journal_block_size = 4096;
}
else if (journal_block_size % MEM_ALIGNMENT)
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!meta_block_size)
{
meta_block_size = 4096;
}
else if (meta_block_size % MEM_ALIGNMENT)
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (data_offset % disk_alignment)
{

View File

@@ -276,7 +276,8 @@ resume_4:
new_id = 1+INODE_NO_POOL(kv.value.uint64_value());
max_id_mod_rev = kv.mod_revision;
}
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id, 0));
// Also check existing inodes - for the case when some inodes are created without changing /index/maxid
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id+1, 0));
if (ino_it != parent->cli->st_cli.inode_config.begin())
{
ino_it--;

View File

@@ -14,6 +14,11 @@
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
{
std::string device = cfg["device"].string_value();
if (device == "")
{
fprintf(stderr, "Device path is missing\n");
exit(1);
}
uint64_t object_size = parse_size(cfg["object_size"].string_value());
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());

View File

@@ -52,7 +52,7 @@ int main(int argc, char *argv[])
self.journal_block = strtoul(argv[b+1], NULL, 10);
self.journal_offset = strtoull(argv[b+2], NULL, 10);
self.journal_len = strtoull(argv[b+3], NULL, 10);
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
if (self.journal_block < DIRECT_IO_ALIGNMENT || (self.journal_block % DIRECT_IO_ALIGNMENT) ||
self.journal_block > 128*1024)
{
printf("Invalid journal block size\n");

173
src/dump_meta.cpp Normal file
View File

@@ -0,0 +1,173 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "object_id.h"
#include "osd_id.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_VERSION_V1 1
#define DIRECT_IO_ALIGNMENT 512
#define MEM_ALIGNMENT 4096
struct __attribute__((__packed__)) clean_disk_entry_v0_t
{
object_id oid;
uint64_t version;
uint8_t bitmap[];
};
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
uint64_t zero;
uint64_t magic;
uint64_t version;
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
};
struct meta_dumper_t
{
char *meta_device;
uint32_t meta_block_size;
uint64_t meta_offset;
uint64_t meta_len;
uint64_t meta_pos;
int fd;
};
int main(int argc, char *argv[])
{
meta_dumper_t self = { 0 };
int b = 1;
if (argc < b+4)
{
printf("USAGE: %s <meta_file> <meta_block_size> <offset> <size>\n", argv[0]);
return 1;
}
self.meta_device = argv[b];
self.meta_block_size = strtoul(argv[b+1], NULL, 10);
self.meta_offset = strtoull(argv[b+2], NULL, 10);
self.meta_len = strtoull(argv[b+3], NULL, 10);
if (self.meta_block_size % DIRECT_IO_ALIGNMENT)
{
printf("Invalid metadata block size\n");
return 1;
}
self.fd = open(self.meta_device, O_DIRECT|O_RDONLY);
if (self.fd == -1)
{
printf("Failed to open metadata device\n");
return 1;
}
// Read all metadata into memory
void *data = memalign(MEM_ALIGNMENT, self.meta_len);
if (!data)
{
printf("Failed to allocate %lu MB of memory\n", self.meta_len/1024/1024);
close(self.fd);
return 1;
}
while (self.meta_pos < self.meta_len)
{
int r = pread(self.fd, data+self.meta_pos, self.meta_len-self.meta_pos, self.meta_offset+self.meta_pos);
assert(r > 0);
self.meta_pos += r;
}
close(self.fd);
// Check superblock
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
if (hdr->zero == 0 &&
hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
hdr->version == BLOCKSTORE_META_VERSION_V1)
{
// Vitastor 0.6-0.7 - static array of clean_disk_entry_v0_t with bitmaps
if (hdr->meta_block_size != self.meta_block_size)
{
printf("Using block size %u bytes based on information from the superblock\n", hdr->meta_block_size);
self.meta_block_size = hdr->meta_block_size;
}
uint64_t clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8;
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t) + 2*clean_entry_bitmap_size;
uint64_t block_num = 0;
printf(
"{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
);
bool first = true;
for (uint64_t meta_pos = self.meta_block_size; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
{
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu,\"bitmap\":\""
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[i]);
}
printf("\",\"ext_bitmap\":\"");
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[clean_entry_bitmap_size + i]);
}
printf("\"}");
}
}
}
printf("]}\n");
}
else
{
// Vitastor 0.4-0.5 - static array of clean_disk_entry_v0_t
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t);
uint64_t block_num = 0;
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", self.meta_block_size);
bool first = true;
for (uint64_t meta_pos = 0; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
{
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu}"
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
}
}
}
printf("]}\n");
}
free(data);
}

2
src/lrc/Makefile Normal file
View File

@@ -0,0 +1,2 @@
mat: mat.c
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure

291
src/lrc/mat.c Normal file
View File

@@ -0,0 +1,291 @@
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns
// w should be >= log2(data_drives + groups*local + global), but not necessary 8/16/32
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
if (w < 0 || w > 32 || data_drives + groups*local + global > (1<<w))
{
return NULL;
}
int *lrc_matrix = (int*)malloc(sizeof(int) * (local*groups+global));
int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
// Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
//for (int i = 0; i < local+global; i++)
//{
// int t = matrix[i*data_drives + 3];
// matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
// matrix[i*data_drives + 7] = t;
//}
for (int gr = 0; gr < groups; gr++)
{
for (int l = 0; l < local; l++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(gr*local+l)*data_drives + j] = (j / (data_drives/groups)) == gr ? matrix[l*data_drives + j] : 0;
}
}
}
for (int i = 0; i < global; i++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
}
}
free(matrix);
return lrc_matrix;
}
struct lrc_test_result_t
{
int success, impossible, failures;
};
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
{
int n = data_drives;
int total_rows = n + groups*local + global;
int impossible = 0, success = 0, failures = 0;
int *lost_per_group = (int*)malloc(sizeof(int) * groups);
int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
// global+1 is always recoverable
for (int lost = global+2; lost <= groups*local+global; lost++)
{
int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
for (int i = 0; i < n; i++)
p[i] = i;
int *p2 = (int*)malloc(sizeof(int) * n);
if (total_rows-lost > n)
{
p[n-1] = n; // skip combinations with all N data disks (0..n-1)
for (int i = n; i < total_rows-lost; i++)
p[i] = i+1;
p[total_rows-lost-1]--; // will be incremented on the first step
}
int inc = total_rows-lost-1;
while (1)
{
p[inc]++;
if (p[inc] >= n+groups*local+global)
{
if (inc == 0)
break;
inc--;
}
else if (inc+1 < total_rows-lost)
{
p[inc+1] = p[inc];
inc++;
}
else
{
// Check if it should be recoverable
// Calculate count of data chunks lost in each group
int nsel = 0;
for (int gr = 0; gr < groups; gr++)
{
lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
recovered_per_group[gr] = 0;
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] < n)
{
lost_per_group[(p[j] / (n/groups))]--;
selected_inverted[nsel++] = j;
}
}
// Every local parity chunk is supposed to restore 1 missing chunk inside its group
// So, subtract local parity chunk counts from each group lost chunk count
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n && p[j] < n+groups*local)
{
int gr = (p[j]-n)/local;
if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
{
selected_inverted[nsel++] = j;
}
recovered_per_group[gr]++;
}
}
// Every global parity chunk is supposed to restore 1 chunk of all that are still missing
int still_missing = 0;
for (int gr = 0; gr < groups; gr++)
{
int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
still_missing += (non_fixed > 0 ? non_fixed : 0);
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n+groups*local)
{
if (still_missing > 0 && nsel < n)
{
selected_inverted[nsel++] = j;
}
still_missing--;
}
}
if (still_missing <= 0)
{
// We hope it can be recoverable. Try to invert it
assert(nsel == n);
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
}
}
int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
if (invert_ok < 0)
{
failures++;
if (log_level > 0)
{
printf("\nFAIL: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\nDIRECT:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", lrc_matrix[p[i]*n+j]);
printf("\n");
}
printf("INVERSE:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", inverted_matrix[i*n+j]);
printf("\n");
}
}
}
else
{
success++;
if (log_level > 2)
{
printf("OK: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
else
{
impossible++;
if (log_level > 1)
{
printf("IMPOSSIBLE: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
}
free(p2);
free(p);
free(inverted_matrix);
free(erased_matrix);
}
free(lost_per_group);
free(recovered_per_group);
return (struct lrc_test_result_t){
.success = success,
.impossible = impossible,
.failures = failures,
};
}
int main()
{
int W = 8, MATRIX_W = 8;
int n = 8, groups = 2, local = 1, global = 2;
//n = 4, groups = 2, local = 1, global = 1;
int total_rows = n+groups*local+global;
int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
// Fill identity+LRC matrix
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
lrc_matrix[i*n + j] = j == i ? 1 : 0;
memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
free(matrix);
matrix = NULL;
// Print LRC matrix
for (int i = 0; i < total_rows; i++)
{
for (int j = 0; j < n; j++)
{
printf("%d ", lrc_matrix[i*n+j]);
}
printf("\n");
}
struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
return 0;
}
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
//
// Can't recover
// 1 2 4 5 8 9 10 11 -1
// 2 3 4 6 8 9 10 11 -1
// FULL:
// 1 0 0 0 0 0 0 0
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 0 0 0 0 0 0 1 0
// 0 0 0 0 0 0 0 1
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// FIRST UNRECOVERABLE:
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// SECOND UNRECOVERABLE:
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 1 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// Ho ho ho

View File

@@ -30,13 +30,19 @@
#define OSD_OP_PING 15
#define OSD_OP_SEC_READ_BMP 16
#define OSD_OP_MAX 16
// Alignment & limit for read/write operations
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 512
#endif
#define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1
// Memory alignment for direct I/O (usually 512 bytes)
#ifndef DIRECT_IO_ALIGNMENT
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// common request and reply headers
struct __attribute__((__packed__)) osd_op_header_t
{

View File

@@ -4,11 +4,13 @@
#include <stdexcept>
#include <string.h>
#include <assert.h>
extern "C" {
#include <reed_sol.h>
#include <jerasure.h>
#ifdef WITH_ISAL
#include <isa-l/erasure_code.h>
#endif
}
#include <map>
#include "allocator.h"
#include "xor.h"

View File

@@ -7,6 +7,7 @@
#include "object_id.h"
#include "osd_id.h"
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif

View File

@@ -12,6 +12,8 @@ SCHEME=ec ./test_change_pg_count.sh
./test_change_pg_size.sh
./test_create_nomaxid.sh
./test_etcd_fail.sh
./test_failure_domain.sh

21
tests/test_create_nomaxid.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/bash -ex
# Test vitastor-cli create when /index/maxid is out of sync
. `dirname $0`/run_3osds.sh
$ETCDCTL put /vitastor/config/inode/1/120 '{"name":"testimg","size":'$((1024*1024*1024))'}'
build/src/vitastor-cli create --etcd_address $ETCD_URL -s 1G testimg2
t=$($ETCDCTL get --print-value-only /vitastor/config/inode/1/121 | jq -r .name)
if [[ "$t" != "testimg2" ]]; then
format_error "testimg2 should've been created as inode 121"
fi
t=$($ETCDCTL get --print-value-only /vitastor/index/maxid/1)
if [[ "$t" != 121 ]]; then
format_error "/index/maxid should've been set to 121"
fi
format_green OK