Compare commits

..

13 Commits

Author SHA1 Message Date
8fdf30b21f Release 0.8.1
- Remove an additional data copy operation when flushing journal (should
  slightly increase write performance)
- Fix a bug where new writes in the inmemory_journal=false mode could overwrite
  the data currently read by a parallel read operation
- Fix degraded parity writes for EC N+K when K>1 where the bug could also lead
  to an "assertion failed" error
- Fix missing journal space check for "big" writes which could lead to
  "prefill_single_journal_entry(): assertion failed..." error in OSD
- Fix possible "assertion failed: next->prev_wait >= 0" in client in rare cases
- Fix missing "len" field in vitastor-disk write-journal big_writes
- Fix possible crash of a full OSD (ENOSPC)
- Fix CSI build scripts to include newest packages every time
- Fix CSI endpoint in the liveness probe manifest
2022-11-20 11:44:09 +03:00
238037ae31 Make journal trimmer wait until reads are completed when inmemory_journal is false
Without this new writes may in theory overwrite journal data being read at that time
2022-11-20 01:49:21 +03:00
09a8864686 Fix degraded parity writes for EC N+K when K>1
Fixes possible `calc_rmw_parity_ec(): Assertion `bufs[i][curbuf[i]].buf' failed` error
2022-11-20 00:50:13 +03:00
6e6f6ecbb0 Add missing journal space check for big_writes
Fixes possible `prefill_single_journal_entry(): Assertion `!journal.sector_info[journal.cur_sector].flush_count' failed` error
2022-11-20 00:50:13 +03:00
9491f81419 Add missing documentation for OSD tags 2022-11-20 00:50:13 +03:00
44c2b30167 Take newest packages every time when rebuilding CSI 2022-11-20 00:50:13 +03:00
bf8a0581cd Fix possible "assertion failed: next->prev_wait >= 0" in client 2022-11-20 00:50:13 +03:00
5953942042 Add crc32c test utility 2022-11-20 00:50:13 +03:00
a276a1f737 Do not copy journal data additional time when flushing 2022-11-20 00:50:13 +03:00
cc24e5796e Add a FIXME 2022-11-20 00:50:09 +03:00
6e26732e6a Fix skipped "len" field in vitastor-disk write-journal big_writes 2022-11-12 12:01:40 +03:00
b4edc79449 Fix possible segfault on ENOSPC 2022-11-12 11:59:43 +03:00
5f26887d32 Fix csi endpoint in liveness probe 2022-11-10 18:37:37 +03:00
29 changed files with 264 additions and 74 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.0")
set(VERSION "0.8.1")
add_subdirectory(src)

View File

@@ -18,15 +18,19 @@ ENV CSI_ENDPOINT=""
RUN apt-get update && \
apt-get install -y wget && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
(echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
(echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update && \
apt-get install -y e2fsprogs xfsprogs vitastor kmod && \
apt-get install -y e2fsprogs xfsprogs kmod && \
apt-get clean && \
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
COPY --from=build /app/vitastor-csi /bin/
RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client && \
apt-get clean
ENTRYPOINT ["/bin/vitastor-csi"]

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.0
VERSION ?= v0.8.1
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.0
image: vitalif/vitastor-csi:v0.8.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"
@@ -102,7 +102,7 @@ spec:
- "--health-port=9898"
env:
- name: CSI_ENDPOINT
value: unix://csi/csi.sock
value: unix:///csi/csi.sock
volumeMounts:
- mountPath: /csi
name: socket-dir

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.0
image: vitalif/vitastor-csi:v0.8.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.0"
vitastorCSIDriverVersion = "0.8.1"
)
// Config struct fills the parameters of request or user input

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.0-1) unstable; urgency=medium
vitastor (0.8.1-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.0-1) unstable; urgency=medium
vitastor (0.8.1-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.0; \
cd vitastor-0.8.0; \
cp -r /root/vitastor vitastor-0.8.1; \
cd vitastor-0.8.1; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.0.orig.tar.xz vitastor-0.8.0; \
cd vitastor-0.8.0; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.1.orig.tar.xz vitastor-0.8.1; \
cd vitastor-0.8.1; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -82,7 +82,7 @@ Parent node reference is required for intermediate tree nodes.
Separate OSD settings are set in etc keys `/vitastor/config/osd/<number>`
in JSON format `{"<key>":<value>}`.
As of now, there is only one setting:
As of now, two settings are supported:
## reweight
@@ -96,6 +96,15 @@ This means an OSD configured with reweight lower than 1 receives less PGs than
it normally would. An OSD with reweight = 0 won't store any data. You can set
reweight to 0 to trigger rebalance and remove all data from an OSD.
## tags
- Type: string or array of strings
Sets tag or multiple tags for this OSD. Tags can be used to group OSDs into
subsets and then use a specific subset for pool instead of all OSDs.
For example you can mark SSD OSDs with tag "ssd" and HDD OSDs with "hdd" and
such tags will work as device classes.
# Pool parameters
## name

View File

@@ -81,7 +81,10 @@
Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
в JSON-формате `{"<key>":<value>}`.
На данный момент поддерживается одна настройка:
На данный момент поддерживаются две настройки:
- [reweight](#reweight)
- [tags](#tags)
## reweight
@@ -96,6 +99,15 @@
хранении данных вообще. Вы можете установить reweight в 0, чтобы убрать
все данные с OSD.
## tags
- Тип: строка или массив строк
Задаёт тег или набор тегов для данного OSD. Теги можно использовать, чтобы
делить OSD на множества и потом размещать пул только на части OSD, а не на
всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
этом смысле теги работают аналогично классам устройств.
# Параметры
## name

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.0'
VERSION = '0.8.1'
LOG = logging.getLogger(__name__)

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.1$(rpm --eval '%dist').tar.gz *

View File

@@ -58,7 +58,7 @@
+BuildRequires: gperftools-devel
+BuildRequires: libusbx-devel >= 1.0.21
%if %{have_usbredir}
BuildRequires: usbredir-devel >= 0.8.0
BuildRequires: usbredir-devel >= 0.8.1
%endif
@@ -856,12 +861,13 @@ BuildRequires: virglrenderer-devel
# For smartcard NSS support

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.0
Version: 0.8.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.0.el7.tar.gz
Source0: vitastor-0.8.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.0
Version: 0.8.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.0.el8.tar.gz
Source0: vitastor-0.8.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.0")
add_definitions(-DVERSION="0.8.1")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -263,6 +263,14 @@ target_link_libraries(test_cas
vitastor_client
)
# test_crc32
add_executable(test_crc32
test_crc32.cpp
)
target_link_libraries(test_crc32
vitastor_blk
)
# test_cluster_client
add_executable(test_cluster_client
test_cluster_client.cpp

View File

@@ -615,7 +615,11 @@ resume_1:
}
for (it = v.begin(); it != v.end(); it++)
{
free(it->buf);
// Free it if it's not taken from the journal
if (it->buf && (!bs->journal.inmemory || it->buf < bs->journal.buffer || it->buf >= bs->journal.buffer + bs->journal.len))
{
free(it->buf);
}
}
v.clear();
// And sync metadata (in batches - not per each operation!)
@@ -760,16 +764,17 @@ bool journal_flusher_co::scan_dirty(int wait_base)
{
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign_or_die(MEM_ALIGNMENT, submit_len) });
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len });
copy_count++;
if (bs->journal.inmemory)
{
// Take it from memory
memcpy(it->buf, (uint8_t*)bs->journal.buffer + submit_offset, submit_len);
// Take it from memory, don't copy it
it->buf = (uint8_t*)bs->journal.buffer + submit_offset;
}
else
{
// Read it from disk
it->buf = memalign_or_die(MEM_ALIGNMENT, submit_len);
await_sqe(0);
data->iov = (struct iovec){ it->buf, (size_t)submit_len };
data->callback = simple_callback_r;

View File

@@ -166,6 +166,7 @@ struct __attribute__((__packed__)) dirty_entry
struct fulfill_read_t
{
uint64_t offset, len;
uint64_t journal_sector; // sector+1 if used and !journal.inmemory, otherwise 0
};
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
@@ -305,7 +306,7 @@ class blockstore_impl_t
// Read
int dequeue_read(blockstore_op_t *read_op);
int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location);
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);

View File

@@ -42,7 +42,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
// FIXME I've seen a bug here so I want some tests
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location)
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector)
{
uint32_t cur_start = item_start;
if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
@@ -72,6 +72,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
fulfill_read_t el = {
.offset = cur_start,
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
.journal_sector = journal_sector,
};
it = PRIV(read_op)->read_vec.insert(it, el);
if (!fulfill_read_push(read_op,
@@ -156,8 +157,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
// If inmemory_journal is false, journal trim will have to wait until the read is completed
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
(IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0)))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -186,7 +189,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
if (!dsk.clean_entry_bitmap_size)
{
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -207,7 +211,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@@ -218,7 +222,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_it->second.location + bmp_start * dsk.bitmap_granularity))
clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -233,7 +237,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else if (fulfilled < read_op->len)
{
// fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
@@ -249,6 +253,15 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
FINISH_OP(read_op);
return 2;
}
if (!journal.inmemory)
{
// Journal trim has to wait until the read is completed - record journal sector usage
for (auto & rv: PRIV(read_op)->read_vec)
{
if (rv.journal_sector)
journal.used_sectors[rv.journal_sector-1]++;
}
}
read_op->retval = 0;
return 2;
}
@@ -264,6 +277,19 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
}
if (PRIV(op)->pending_ops == 0)
{
if (!journal.inmemory)
{
// Release journal sector usage
for (auto & rv: PRIV(op)->read_vec)
{
if (rv.journal_sector)
{
auto used = --journal.used_sectors[rv.journal_sector-1];
if (used == 0)
journal.used_sectors.erase(rv.journal_sector-1);
}
}
}
if (op->retval == 0)
op->retval = op->len;
FINISH_OP(op);

View File

@@ -222,7 +222,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
int used = --journal.used_sectors[dirty_it->second.journal_sector];
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,

View File

@@ -182,7 +182,8 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
bool found = false;
for (auto other_op: submit_queue)
{
if (!found && other_op == op)
// <op> may be present in queue multiple times due to moving operations in submit_queue
if (other_op == op)
found = true;
else if (found && other_op->oid == op->oid &&
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
@@ -448,6 +449,12 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry
{
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1,
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
BS_SUBMIT_CHECK_SQES(1);
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,

View File

@@ -221,9 +221,11 @@ void cluster_client_t::erase_op(cluster_op_t *op)
if (op_queue_tail == op)
op_queue_tail = op->prev;
op->next = op->prev = NULL;
std::function<void(cluster_op_t*)>(op->callback)(op);
if (!(flags & OP_IMMEDIATE_COMMIT))
inc_wait(opcode, flags, next, -1);
// Call callback at the end to avoid inconsistencies in prev_wait
// if the callback adds more operations itself
std::function<void(cluster_op_t*)>(op->callback)(op);
}
void cluster_client_t::continue_ops(bool up_retry)

View File

@@ -424,6 +424,7 @@ int disk_tool_t::write_json_journal(json11::Json entries)
.stripe = sscanf_json(NULL, rec["stripe"]),
},
.version = rec["ver"].uint64_value(),
.len = (uint32_t)rec["len"].uint64_value(),
.location = sscanf_json(NULL, rec["loc"]),
};
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write));

View File

@@ -478,15 +478,18 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
{
if (write_osd_set[role] != 0)
{
write_parity = 1;
write_parity++;
if (write_osd_set[role] != read_osd_set[role])
{
start = 0;
end = chunk_size;
for (int r2 = pg_minsize; r2 < role; r2++)
{
stripes[r2].write_start = start;
stripes[r2].write_end = end;
if (write_osd_set[r2] != 0)
{
stripes[r2].write_start = start;
stripes[r2].write_end = end;
}
}
}
stripes[role].write_start = start;
@@ -555,7 +558,7 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
}
}
// Allocate read buffers
void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
void *rmw_buf = alloc_read_buffer(stripes, pg_size, write_parity * (end - start));
// Position write buffers
uint64_t buf_pos = 0, in_pos = 0;
for (int role = 0; role < pg_size; role++)
@@ -804,13 +807,11 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
if (end != 0)
{
int i;
for (i = pg_minsize; i < pg_size; i++)
{
int write_parity = 0;
for (int i = pg_minsize; i < pg_size; i++)
if (write_osd_set[i] != 0)
break;
}
if (i < pg_size)
write_parity++;
if (write_parity > 0)
{
// Calculate new coding chunks
buf_len_t bufs[pg_size][3];
@@ -830,8 +831,11 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
}
for (int i = pg_minsize; i < pg_size; i++)
{
bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
positions[i] = start;
if (write_osd_set[i] != 0)
{
bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
positions[i] = start;
}
}
uint32_t pos = start;
while (pos < end)
@@ -839,31 +843,37 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint32_t next_end = end;
for (int i = 0; i < pg_size; i++)
{
assert(curbuf[i] < nbuf[i]);
assert(bufs[i][curbuf[i]].buf);
data_ptrs[i] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end > this_end)
next_end = this_end;
if (i < pg_minsize || write_osd_set[i] != 0)
{
assert(curbuf[i] < nbuf[i]);
assert(bufs[i][curbuf[i]].buf);
data_ptrs[i] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end > this_end)
next_end = this_end;
}
}
assert(next_end > pos);
for (int i = 0; i < pg_size; i++)
{
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end >= this_end)
if (i < pg_minsize || write_osd_set[i] != 0)
{
positions[i] += bufs[i][curbuf[i]].len;
curbuf[i]++;
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end >= this_end)
{
positions[i] += bufs[i][curbuf[i]].len;
curbuf[i]++;
}
}
}
#ifdef WITH_ISAL
ec_encode_data(
next_end-pos, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
next_end-pos, pg_minsize, write_parity, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
);
#endif
@@ -871,16 +881,19 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
}
for (int i = 0; i < pg_size; i++)
{
data_ptrs[i] = stripes[i].bmp_buf;
if (i < pg_minsize || write_osd_set[i] != 0)
{
data_ptrs[i] = stripes[i].bmp_buf;
}
}
#ifdef WITH_ISAL
ec_encode_data(
bitmap_size, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
bitmap_size, pg_minsize, write_parity, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
);
#endif

View File

@@ -20,6 +20,7 @@ void test11();
void test12();
void test13();
void test14();
void test15();
int main(int narg, char *args[])
{
@@ -47,6 +48,8 @@ int main(int narg, char *args[])
test13();
// Test 14
test14();
// Test 15
test15();
// End
printf("all ok\n");
return 0;
@@ -706,7 +709,7 @@ void test13()
/***
13. basic jerasure 2+1 test
14. basic jerasure 2+1 test
calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
@@ -727,13 +730,13 @@ void test14()
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {};
unsigned bitmaps[3] = { 0 };
// Test 13.0
// Test 14.0
void *write_buf = malloc_or_die(8192);
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
// Test 13.1
// Test 14.1
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, bmp);
for (int i = 0; i < 3; i++)
stripes[i].bmp_buf = bitmaps+i;
@@ -750,7 +753,7 @@ void test14()
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == (uint8_t*)write_buf+4096);
assert(stripes[2].write_buf == rmw_buf);
// Test 13.2 - encode
// Test 14.2 - encode
set_pattern(write_buf, 8192, PATTERN3);
set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
@@ -767,7 +770,7 @@ void test14()
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == (uint8_t*)write_buf+4096);
assert(stripes[2].write_buf == rmw_buf);
// Test 13.3 - decode and verify
// Test 14.3 - decode and verify
osd_num_t read_osd_set[4] = { 0, 2, 3 };
memset(stripes, 0, sizeof(stripes));
split_stripes(2, 128*1024, 0, 128*1024, stripes);
@@ -802,3 +805,74 @@ void test14()
free(write_buf);
use_ec(3, 2, false);
}
/***
15. EC 2+2 partial overwrite with 1 missing stripe
calc_rmw(offset=64K+28K, len=4K, osd_set=[1,2,3,0], write_set=[1,2,3,0])
= {
read: [ [ 28K, 32K ], [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 0, 0 ], [ 28K, 32K ], [ 28K, 32K ], [ 0, 0 ] ],
input buffer: [ write1 ],
rmw buffer: [ write2, read0 ],
}
***/
void test15()
{
const int bmp = 64*1024 / 4096 / 8;
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 };
// Test 15.0
void *write_buf = malloc_or_die(4096);
split_stripes(2, 64*1024, (64+28)*1024, 4096, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
assert(stripes[1].req_start == 28*1024 && stripes[1].req_end == 32*1024);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
// Test 15.1
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i;
assert(rmw_buf);
assert(stripes[0].read_start == 28*1024 && stripes[0].read_end == 32*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == NULL);
assert(stripes[2].read_buf == NULL);
assert(stripes[3].read_buf == NULL);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
// Test 15.2 - encode
set_pattern(write_buf, 4*1024, PATTERN1);
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
// Done
free(rmw_buf);
free(write_buf);
use_ec(3, 2, false);
}

28
src/test_crc32.cpp Normal file
View File

@@ -0,0 +1,28 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include "malloc_or_die.h"
#include "errno.h"
#include "crc32c.h"
int main(int narg, char *args[])
{
int bufsize = 65536;
uint8_t *buf = (uint8_t*)malloc_or_die(bufsize);
uint32_t csum = 0;
while (1)
{
int r = read(0, buf, bufsize);
if (r <= 0 && errno != EAGAIN && errno != EINTR)
break;
csum = crc32c(csum, buf, r);
}
free(buf);
printf("%08x\n", csum);
return 0;
}

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.0
Version: 0.8.1
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}