Compare commits

...

24 Commits

Author SHA1 Message Date
a45e0e5e67 Use custom decoding instead of just jerasure_matrix_decode()
- Cache the decoding matrix
- Don't do unnecessary erasures->erased conversion during decoding
- Avoid extra memory allocations during decoding
- Don't always reconstruct coding chunks
- Reconstruct chunks one-by-one, without overlapping ranges
2020-12-04 17:43:48 +03:00
44656fbf67 Allow writes with low version numbers after a delete 2020-12-04 11:54:41 +03:00
089f138e0c Allow situations where the journal contains a big_write(v1) after delete(v2) and v1 < v2
Fixes a crash in the following scenario:
- client issues a delete request (object version is at least 2)
- OSD has time to flush it to the metadata, but doesn't have time to move the journal start pointer on disk
- client overwrites the same object and it gets the version number 1 again
- OSD is restarted and sees delete(v=2), big_write(v=1) in the journal
- dirty_db sequence gets broken and OSD crashes with assert("Writes and deletes shouldn't happen at the same time")
2020-12-04 11:47:27 +03:00
bcc8e697f9 Delete PGs when deleting pools
(All OSD crash with "Online PG count change not allowed" if you try to delete an active pool though)
2020-12-04 11:47:27 +03:00
a4c46ba745 Add jerasure EC support (reed_sol_van, others are slower) (not tested yet) 2020-12-04 11:47:27 +03:00
5596ad8997 Use custom QEMU build for CentOS 7 2020-12-04 11:47:05 +03:00
59c29b0cee Fix RPATH for CentOS builds, add additional repos into the CentOS installation instructions 2020-12-04 11:47:04 +03:00
959089b919 Enable progress_notify=true for etcd watches 2020-11-17 16:29:42 +03:00
d3e7749616 Final fixes for packaging 2020-11-10 23:33:07 +03:00
b56f8820ec Container packaging for Debian 11 Bullseye, CentOS 7 and CentOS 8 2020-11-10 00:02:53 +03:00
4bd2bd48eb Build Vitastor packages, too 2020-11-09 14:41:39 +03:00
a3fc9f8d7d Add a Dockerfile to build patched QEMU for Debian (Buster) 2020-11-09 02:30:41 +03:00
530975aed7 Make it also build with GCC 8 and on Debian Buster 2020-11-09 00:07:07 +03:00
1446aad107 Simple patch for qemu-kvm .spec 2020-11-08 02:14:53 +03:00
46479e2456 Add RPM build scripts for CentOS 8 2020-11-08 01:55:17 +03:00
e41bee72a5 Lower node.js requirement to 10.x 2020-11-08 01:54:12 +03:00
2e0f223ddb Add RPM build scripts for CentOS 7 2020-11-07 01:52:10 +03:00
3be7bc29d8 Make it build with QEMU 2.0, too
Also begin to work on rpms
2020-11-06 20:05:00 +03:00
0c43ff9daf Add scripts to copy fio and qemu includes to the source package 2020-11-06 18:40:42 +03:00
64d471cf53 Add simple Debian packaging 2020-11-06 18:40:42 +03:00
809b2ad8cd Add install target 2020-11-06 01:12:22 +03:00
550d4af151 Rename test.cpp to test_shit.cpp (random shit) 2020-11-06 01:12:22 +03:00
cf0f23ab8e Add patches for QEMU QAPI IDL 2020-11-04 23:30:51 +03:00
a516fefa8c Add qemu_module_dummy and qemu_stamp_xxx to qemu_driver.c 2020-11-04 23:10:29 +03:00
53 changed files with 2336 additions and 540 deletions

17
.dockerignore Normal file
View File

@@ -0,0 +1,17 @@
.git
build
mon/node_modules
*.o
*.so
osd
stub_osd
stub_uring_osd
stub_bench
osd_test
dump_journal
nbd_proxy
rm_inode
fio
qemu
rpm/*.Dockerfile
debian/*.Dockerfile

View File

@@ -1,10 +1,28 @@
BINDIR ?= /usr/bin
LIBDIR ?= /usr/lib/x86_64-linux-gnu
QEMU_PLUGINDIR ?= /usr/lib/x86_64-linux-gnu/qemu
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
# -fsanitize=address # -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always -I/usr/include/jerasure
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
clean: clean:
rm -f *.o rm -f *.o libblockstore.so libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
install: all
mkdir -p $(DESTDIR)$(LIBDIR)/vitastor
install -m 0755 libfio_sec_osd.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libfio_cluster.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libfio_blockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libblockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
mkdir -p $(DESTDIR)$(BINDIR)
install -m 0755 osd $(DESTDIR)$(BINDIR)/vitastor-osd
install -m 0755 dump_journal $(DESTDIR)$(BINDIR)/vitastor-dump-journal
install -m 0755 nbd_proxy $(DESTDIR)$(BINDIR)/vitastor-nbd
install -m 0755 rm_inode $(DESTDIR)$(BINDIR)/vitastor-rm
mkdir -p $(DESTDIR)$(QEMU_PLUGINDIR)
install -m 0755 qemu_driver.so $(DESTDIR)$(QEMU_PLUGINDIR)/block-vitastor.so
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
g++ $(CXXFLAGS) -o $@ $< crc32c.o g++ $(CXXFLAGS) -o $@ $< crc32c.o
@@ -12,19 +30,19 @@ dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
libblockstore.so: $(BLOCKSTORE_OBJS) libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \ OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \ osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \
osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS) osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring -lJerasure
stub_osd: stub_osd.o rw_blocking.o stub_osd: stub_osd.o rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
osd_rmw_test: osd_rmw_test.o osd_rmw_test: osd_rmw_test.o
g++ $(CXXFLAGS) -o $@ osd_rmw_test.o g++ $(CXXFLAGS) -o $@ osd_rmw_test.o -lJerasure -fsanitize=address
STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
stub_uring_osd: $(STUB_URING_OSD_OBJS) stub_uring_osd: $(STUB_URING_OSD_OBJS)
@@ -55,12 +73,12 @@ qemu_driver.o: qemu_driver.c qemu_proxy.h
-I qemu/include $(CXXFLAGS) -c -o $@ $< -I qemu/include $(CXXFLAGS) -c -o $@ $<
qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS) qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $< $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
test: test.cpp osd_peering_pg.o test_shit: test_shit.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm g++ $(CXXFLAGS) -o test_shit test_shit.cpp -luring -lm
test_allocator: test_allocator.cpp allocator.o test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
@@ -165,12 +183,12 @@ stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
test_allocator.o: test_allocator.cpp allocator.h test_allocator.o: test_allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
test_shit.o: test_shit.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h

View File

@@ -31,11 +31,11 @@ breaking changes in the future. However, the following is implemented:
- QEMU driver (built out-of-tree) - QEMU driver (built out-of-tree)
- Loadable fio engine for benchmarks (also built out-of-tree) - Loadable fio engine for benchmarks (also built out-of-tree)
- NBD proxy for kernel mounts - NBD proxy for kernel mounts
- Inode removal tool (./rm_inode) - Inode removal tool (vitastor-rm)
- Packaging for Debian and CentOS
## Roadmap ## Roadmap
- Packaging for Debian and, probably, CentOS too
- OSD creation tool (OSDs currently have to be created by hand) - OSD creation tool (OSDs currently have to be created by hand)
- Other administrative tools - Other administrative tools
- Per-inode I/O and space usage statistics - Per-inode I/O and space usage statistics
@@ -280,7 +280,34 @@ Vitastor with single-thread NBD on the same hardware:
- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio) - Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio) - Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)
## Building ## Installation
### Debian
- Trust Vitastor package signing key:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- For Debian 10 (Buster) also enable backports repository:
`deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64`
### CentOS
- Add Vitastor package repository:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
- CentOS 7: `yum install centos-release-scl`
- CentOS 8: `dnf install centos-release-advanced-virtualization`
- Enable elrepo-kernel:
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
### Building from Source
- Install Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly recommended because - Install Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly recommended because
there is at least one known io_uring hang with 5.4 and an HP SmartArray controller. there is at least one known io_uring hang with 5.4 and an HP SmartArray controller.
@@ -290,10 +317,10 @@ Vitastor with single-thread NBD on the same hardware:
branch release-3.4, because there is a bug in upstream etcd which makes Vitastor OSDs fail to branch release-3.4, because there is a bug in upstream etcd which makes Vitastor OSDs fail to
move PGs out of "starting" state if you have at least around ~500 PGs or so. The custom build move PGs out of "starting" state if you have at least around ~500 PGs or so. The custom build
will be unnecessary when etcd merges the fix: https://github.com/etcd-io/etcd/pull/12402. will be unnecessary when etcd merges the fix: https://github.com/etcd-io/etcd/pull/12402.
- Install node.js 12 or newer. - Install node.js 10 or newer.
- Install gcc and g++ 9.x or later. - Install gcc and g++ 8.x or newer.
- Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules. - Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules.
- Install QEMU 4.x or 5.x, get its source, begin to build it, stop the build and copy headers: - Install QEMU 3.0+, get its source, begin to build it, stop the build and copy headers:
- `<qemu>/include` &rarr; `<vitastor>/qemu/include` - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
- Debian: - Debian:
* Use qemu packages from the main repository * Use qemu packages from the main repository
@@ -303,11 +330,15 @@ Vitastor with single-thread NBD on the same hardware:
* Use qemu packages from the Advanced-Virtualization repository. To enable it, run * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
`yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu` `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
* `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h` * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
* `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi` * For QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
* For QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
- `config-host.h` and `qapi` are required because they contain generated headers - `config-host.h` and `qapi` are required because they contain generated headers
- Install fio 3.16 or later, get its source and symlink it into `<vitastor>/fio`. - You can also rebuild QEMU with a patch that makes LD_PRELOAD unnecessary to load vitastor driver.
See `qemu-*.*-vitastor.patch`.
- Install fio 3.7 or later, get its source and symlink it into `<vitastor>/fio`.
- Build Vitastor with `make -j8`. - Build Vitastor with `make -j8`.
- Copy binaries somewhere. - Run `make install` (optionally with `LIBDIR=/usr/lib64 QEMU_PLUGINDIR=/usr/lib64/qemu-kvm`
if you're using an RPM-based distro).
## Running ## Running
@@ -322,8 +353,8 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'` - Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
(if all your drives have capacitors). (if all your drives have capacitors).
- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`. - Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
- Calculate offsets for your drives with `node ./mon/simple-offsets.js --device /dev/sdX`. - Calculate offsets for your drives with `node /usr/lib/vitastor/mon/simple-offsets.js --device /dev/sdX`.
- Make systemd units for your OSDs. Look at `./mon/make-units.sh` for example. - Make systemd units for your OSDs. Look at `/usr/lib/vitastor/mon/make-units.sh` for example.
Notable configuration variables from the example: Notable configuration variables from the example:
- `disable_data_fsync 1` - only safe with server-grade drives with capacitors. - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
- `immediate_commit all` - use this if all your drives are server-grade. - `immediate_commit all` - use this if all your drives are server-grade.
@@ -342,25 +373,25 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the
number of dirty journal sectors that may be written to at the same time. number of dirty journal sectors that may be written to at the same time.
- `systemctl start vitastor.target` everywhere. - `systemctl start vitastor.target` everywhere.
- Start any number of monitors: `cd mon; node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`. - Start any number of monitors: `node /usr/lib/vitastor/mon/mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
- At this point, one of the monitors will configure PGs and OSDs will start them. - At this point, one of the monitors will configure PGs and OSDs will start them.
- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'. - You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
- Run tests with (for example): `fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`. - Run tests with (for example): `fio -thread -ioengine=/usr/lib/x86_64-linux-gnu/vitastor/libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
- Upload VM disk image with qemu-img (for example): - Upload VM disk image with qemu-img (for example):
``` ```
LD_PRELOAD=./qemu_driver.so qemu-img convert -f qcow2 debian10.qcow2 -p LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img convert -f qcow2 debian10.qcow2 -p
-O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648' -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
``` ```
- Run QEMU with (for example): - Run QEMU with (for example):
``` ```
LD_PRELOAD=./qemu_driver.so qemu-system-x86_64 -enable-kvm -m 1024 LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-system-x86_64 -enable-kvm -m 1024
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512 -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
-vnc 0.0.0.0:0 -vnc 0.0.0.0:0
``` ```
- Remove inode with (for example): - Remove inode with (for example):
``` ```
./rm_inode --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32 vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
``` ```
## Known Problems ## Known Problems

View File

@@ -105,8 +105,8 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
else else
{ {
flush_versions[ov.oid] = ov.version; flush_versions[ov.oid] = ov.version;
flush_queue.push_front(ov.oid);
} }
flush_queue.push_front(ov.oid);
if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0)) if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
{ {
dequeuing = true; dequeuing = true;

View File

@@ -30,12 +30,13 @@
#define BS_ST_BIG_WRITE 0x02 #define BS_ST_BIG_WRITE 0x02
#define BS_ST_DELETE 0x03 #define BS_ST_DELETE 0x03
#define BS_ST_WAIT_BIG 0x10 #define BS_ST_WAIT_DEL 0x10
#define BS_ST_IN_FLIGHT 0x20 #define BS_ST_WAIT_BIG 0x20
#define BS_ST_SUBMITTED 0x30 #define BS_ST_IN_FLIGHT 0x30
#define BS_ST_WRITTEN 0x40 #define BS_ST_SUBMITTED 0x40
#define BS_ST_SYNCED 0x50 #define BS_ST_WRITTEN 0x50
#define BS_ST_STABLE 0x60 #define BS_ST_SYNCED 0x60
#define BS_ST_STABLE 0x70
#define BS_ST_INSTANT 0x100 #define BS_ST_INSTANT 0x100
@@ -153,6 +154,8 @@ struct blockstore_op_private_t
// Write // Write
struct iovec iov_zerofill[3]; struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version;
// Sync // Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes; std::vector<obj_ver_id> sync_big_writes, sync_small_writes;

View File

@@ -562,6 +562,45 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
); );
#endif #endif
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
.oid = je->big_write.oid,
.version = UINT64_MAX,
});
if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
{
dirty_it--;
if (dirty_it->first.oid == je->big_write.oid &&
(dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
{
// It is allowed to overwrite a deleted object with a
// version number less than deletion version number,
// because the presence of a BIG_WRITE entry means that
// the data for it is already on disk.
// Purge all dirty and clean entries for this object.
auto dirty_end = dirty_it;
dirty_end++;
while (1)
{
if (dirty_it == bs->dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != je->big_write.oid)
{
dirty_it++;
break;
}
}
bs->erase_dirty(dirty_it, dirty_end, UINT64_MAX);
auto clean_it = bs->clean_db.find(je->big_write.oid);
if (clean_it != bs->clean_db.end())
{
bs->data_alloc->set(clean_it->second.location >> bs->block_order, false);
bs->clean_db.erase(clean_it);
}
}
}
auto clean_it = bs->clean_db.find(je->big_write.oid); auto clean_it = bs->clean_db.find(je->big_write.oid);
if (clean_it == bs->clean_db.end() || if (clean_it == bs->clean_db.end() ||
clean_it->second.version < je->big_write.version) clean_it->second.version < je->big_write.version)

View File

@@ -234,10 +234,35 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc) void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{ {
auto dirty_it = dirty_end; if (dirty_end == dirty_start)
while (dirty_it != dirty_start)
{ {
return;
}
auto dirty_it = dirty_end;
dirty_it--;
if (IS_DELETE(dirty_it->second.state))
{
object_id oid = dirty_it->first.oid;
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing
uint32_t next_state = BS_ST_IN_FLIGHT;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
if (IS_BIG_WRITE(dirty_it->second.state))
{
next_state = BS_ST_WAIT_BIG;
}
}
dirty_it++;
}
dirty_it = dirty_end;
dirty_it--; dirty_it--;
}
while (1)
{
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc) if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
{ {
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
@@ -256,6 +281,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
{ {
journal.used_sectors.erase(dirty_it->second.journal_sector); journal.used_sectors.erase(dirty_it->second.journal_sector);
} }
if (dirty_it == dirty_start)
{
break;
}
dirty_it--;
} }
dirty_db.erase(dirty_start, dirty_end); dirty_db.erase(dirty_start, dirty_end);
} }

View File

@@ -7,7 +7,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{ {
// Check or assign version number // Check or assign version number
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE); bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
bool is_inflight_big = false; bool wait_big = false, wait_del = false;
uint64_t version = 1; uint64_t version = 1;
if (dirty_db.size() > 0) if (dirty_db.size() > 0)
{ {
@@ -21,7 +21,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
found = true; found = true;
version = dirty_it->first.version + 1; version = dirty_it->first.version + 1;
deleted = IS_DELETE(dirty_it->second.state); deleted = IS_DELETE(dirty_it->second.state);
is_inflight_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
? !IS_SYNCED(dirty_it->second.state) ? !IS_SYNCED(dirty_it->second.state)
: ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG); : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
} }
@@ -38,23 +39,40 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
deleted = true; deleted = true;
} }
} }
if (op->version == 0)
{
op->version = version;
}
else if (op->version < version)
{
// Invalid version requested
op->retval = -EEXIST;
return false;
}
if (deleted && is_del) if (deleted && is_del)
{ {
// Already deleted // Already deleted
op->retval = 0; op->retval = 0;
return false; return false;
} }
if (is_inflight_big && !is_del && !deleted && op->len < block_size && PRIV(op)->real_version = 0;
if (op->version == 0)
{
op->version = version;
}
else if (op->version < version)
{
// Implicit operations must be added like that: DEL [FLUSH] BIG [SYNC] SMALL SMALL
if (deleted || wait_del)
{
// It's allowed to write versions with low numbers over deletes
// However, we have to flush those deletes first as we use version number for ordering
wait_del = true;
PRIV(op)->real_version = op->version;
op->version = version;
flusher->unshift_flush((obj_ver_id){
.oid = op->oid,
.version = version-1,
});
}
else
{
// Invalid version requested
op->retval = -EEXIST;
return false;
}
}
if (wait_big && !is_del && !deleted && op->len < block_size &&
immediate_commit != IMMEDIATE_ALL) immediate_commit != IMMEDIATE_ALL)
{ {
// Issue an additional sync so that the previous big write can reach the journal // Issue an additional sync so that the previous big write can reach the journal
@@ -72,19 +90,28 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else else
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len); printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
#endif #endif
// No strict need to add it into dirty_db here, it's just left // FIXME No strict need to add it into dirty_db here, it's just left
// from the previous implementation where reads waited for writes // from the previous implementation where reads waited for writes
uint32_t state;
if (is_del)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
else
{
state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
if (wait_del)
state |= BS_ST_WAIT_DEL;
else if (state == BS_ST_SMALL_WRITE && wait_big)
state |= BS_ST_WAIT_BIG;
else
state |= BS_ST_IN_FLIGHT;
if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT;
}
dirty_db.emplace((obj_ver_id){ dirty_db.emplace((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}, (dirty_entry){ }, (dirty_entry){
.state = (uint32_t)( .state = state,
is_del
? (BS_ST_DELETE | BS_ST_IN_FLIGHT)
: (op->opcode == BS_OP_WRITE_STABLE ? BS_ST_INSTANT : 0) | (op->len == block_size || deleted
? (BS_ST_BIG_WRITE | BS_ST_IN_FLIGHT)
: (is_inflight_big ? (BS_ST_SMALL_WRITE | BS_ST_WAIT_BIG) : (BS_ST_SMALL_WRITE | BS_ST_IN_FLIGHT)))
),
.flags = 0, .flags = 0,
.location = 0, .location = 0,
.offset = is_del ? 0 : op->offset, .offset = is_del ? 0 : op->offset,
@@ -106,12 +133,35 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end()); assert(dirty_it != dirty_db.end());
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG) if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) < BS_ST_IN_FLIGHT)
{ {
// Don't dequeue // Don't dequeue
return 0; return 0;
} }
else if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE) if (PRIV(op)->real_version != 0)
{
// Restore original low version number for unblocked operations
auto prev_it = dirty_it;
prev_it--;
if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
{
// Original version is still invalid
// FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
dirty_db.erase(dirty_it);
op->retval = -EEXIST;
FINISH_OP(op);
return 1;
}
op->version = PRIV(op)->real_version;
PRIV(op)->real_version = 0;
dirty_entry e = dirty_it->second;
dirty_db.erase(dirty_it);
dirty_it = dirty_db.emplace((obj_ver_id){
.oid = op->oid,
.version = op->version,
}, e).first;
}
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
{ {
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION)) if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -129,6 +179,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
PRIV(op)->wait_for = WAIT_FREE; PRIV(op)->wait_for = WAIT_FREE;
return 0; return 0;
} }
// FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
dirty_db.erase(dirty_it);
op->retval = -ENOSPC; op->retval = -ENOSPC;
FINISH_OP(op); FINISH_OP(op);
return 1; return 1;
@@ -492,7 +544,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++; PRIV(op)->pending_ops++;
// Remember small write as unsynced }
else
{
// Remember delete as unsynced
unsynced_small_writes.push_back((obj_ver_id){ unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,

View File

@@ -488,7 +488,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
uint64_t begin = (op->offset < stripe ? stripe : op->offset); uint64_t begin = (op->offset < stripe ? stripe : op->offset);
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size) uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
? (stripe + pg_block_size) : (op->offset + op->len); ? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i] = { op->parts[i] = (cluster_op_part_t){
.parent = op, .parent = op,
.offset = begin, .offset = begin,
.len = (uint32_t)(end - begin), .len = (uint32_t)(end - begin),
@@ -533,7 +533,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
part->osd_num = primary_osd; part->osd_num = primary_osd;
part->sent = true; part->sent = true;
op->sent_count++; op->sent_count++;
part->op = { part->op = (osd_op_t){
.op_type = OSD_OP_OUT, .op_type = OSD_OP_OUT,
.peer_fd = peer_fd, .peer_fd = peer_fd,
.req = { .rw = { .req = { .rw = {
@@ -694,7 +694,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
assert(peer_it != msgr.osd_peer_fds.end()); assert(peer_it != msgr.osd_peer_fds.end());
part->sent = true; part->sent = true;
op->sent_count++; op->sent_count++;
part->op = { part->op = (osd_op_t){
.op_type = OSD_OP_OUT, .op_type = OSD_OP_OUT,
.peer_fd = peer_it->second, .peer_fd = peer_it->second,
.req = { .req = {

13
copy-fio-includes.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
gcc -E -o fio_headers.i fio_headers.h
rm -rf fio-copy
for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
j=${i##fio/}
p=$(dirname $j)
mkdir -p fio-copy/$p
cp $i fio-copy/$j
done
rm fio_headers.i

18
copy-qemu-includes.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
#cd qemu
#debian/rules b/configure-stamp
#cd b/qemu; make qapi
gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-I qemu/include -E -o qemu_driver.i qemu_driver.c
rm -rf qemu-copy
for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
j=${i##qemu/}
p=$(dirname $j)
mkdir -p qemu-copy/$p
cp $i qemu-copy/$j
done
rm qemu_driver.i

5
debian/changelog vendored Normal file
View File

@@ -0,0 +1,5 @@
vitastor (0.5-1) unstable; urgency=medium
* First packaging for Debian
-- Vitaliy Filippov <vitalif@yourcmc.ru> Thu, 05 Nov 2020 02:20:59 +0300

1
debian/compat vendored Normal file
View File

@@ -0,0 +1 @@
13

17
debian/control vendored Normal file
View File

@@ -0,0 +1,17 @@
Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no
Package: vitastor
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 12), node-sprintf-js, node-ws (>= 7)
Description: Vitastor, a fast software-defined clustered block storage
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives of any
size with configurable redundancy (replication or erasure codes/XOR).

20
debian/copyright vendored Normal file
View File

@@ -0,0 +1,20 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: vitastor
Upstream-Contact: Vitaliy Filippov <vitalif@yourcmc.ru>
Source: https://vitastor.io
Files: *
Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
License: Multiple licenses VNPL-1.0 and/or GPL-2.0+
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network ("Proxy Programs"). Proxy Programs may be made public
not only under the terms of the same license, but also under the terms of any
GPL-Compatible Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.
.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

3
debian/install vendored Normal file
View File

@@ -0,0 +1,3 @@
VNPL-1.0.txt usr/share/doc/vitastor
GPL-2.0.txt usr/share/doc/vitastor
mon usr/lib/vitastor

9
debian/rules vendored Executable file
View File

@@ -0,0 +1,9 @@
#!/usr/bin/make -f
export DH_VERBOSE = 1
%:
dh $@
override_dh_installdeb:
cat debian/substvars >> debian/vitastor.substvars
dh_installdeb

1
debian/source/format vendored Normal file
View File

@@ -0,0 +1 @@
3.0 (quilt)

2
debian/substvars vendored Normal file
View File

@@ -0,0 +1,2 @@
dep:fio=3.16-1
dep:qemu=1:5.1+dfsg-4+vitastor1

86
debian/vitastor-bullseye.Dockerfile vendored Normal file
View File

@@ -0,0 +1,86 @@
# Build packages for Debian Bullseye/Sid inside a container
# cd ..; podman build -t vitastor-bullseye -v `pwd`/build:/root/build -f debian/vitastor-bullseye.Dockerfile .
ARG REL=bullseye
FROM debian:$REL
# again, it doesn't work otherwise
ARG REL=bullseye
WORKDIR /root
RUN grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' > /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/build/qemu-$REL; \
rm -rf /root/build/qemu-$REL/*; \
cd /root/build/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/build/qemu-$REL/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-$REL/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/build/qemu-$REL/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-$REL/qemu-*/debian/patches; \
P=`ls -d /root/build/qemu-$REL/qemu-*/debian/patches`; \
echo qemu-5.1-vitastor.patch >> $P/series; \
fi; \
cd /root/build/qemu-$REL/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
echo ">>> VERSION: $V"; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/qemu-$REL/qemu-*/
RUN cd /root/build/qemu-$REL && apt-get -y install ./qemu-system-data*.deb ./qemu-system-common_*.deb ./qemu-system-x86_*.deb ./qemu_*.deb
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/build/qemu-$REL/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/build/qemu-$REL/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi; \
mkdir -p /root/build/vitastor-$REL; \
rm -rf /root/build/vitastor-$REL/*; \
cd /root/build/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.5; \
ln -s /root/build/qemu-$REL/qemu-*/ vitastor-0.5/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5/fio; \
cd vitastor-0.5; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/build/qemu-$REL/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/build/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.orig.tar.xz vitastor-0.5; \
cd vitastor-0.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/vitastor-$REL/vitastor-*/

80
debian/vitastor-buster.Dockerfile vendored Normal file
View File

@@ -0,0 +1,80 @@
# Build packages for Debian 10 inside a container
# cd ..; podman build -t vitastor-buster -v `pwd`/build:/root/build -f debian/vitastor-buster.Dockerfile .
FROM debian:buster
WORKDIR /root
RUN echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' > /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -t buster-backports -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -t buster-backports -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get -t buster-backports --download-only source qemu-kvm
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/build/qemu-buster; \
rm -rf /root/build/qemu-buster/*; \
cd /root/build/qemu-buster; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/build/qemu-buster/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-buster/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/build/qemu-buster/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-buster/qemu-*/debian/patches; \
echo qemu-5.1-vitastor.patch >> /root/build/qemu-buster/qemu-*/debian/patches/series; \
fi; \
cd /root/build/qemu-buster/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)\).*$/$1/')+vitastor1; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D buster -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/qemu-buster/qemu-*/
RUN cd /root/build/qemu-buster && apt-get -y -t buster-backports install ./qemu-system-data*.deb ./qemu-system-common_*.deb ./qemu-system-x86_*.deb ./qemu_*.deb
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/build/qemu-buster/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/build/qemu-buster/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi; \
mkdir -p /root/build/vitastor-buster; \
rm -rf /root/build/vitastor-buster/*; \
cd /root/build/vitastor-buster; \
cp -r /root/vitastor vitastor-0.5; \
ln -s /root/build/qemu-buster/qemu-*/ vitastor-0.5/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5/fio; \
cd vitastor-0.5; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/build/qemu-buster/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/build/vitastor-buster; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.orig.tar.xz vitastor-0.5; \
cd vitastor-0.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D buster -v "$V""buster" "Rebuild for buster"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/vitastor-buster/vitastor-*/

View File

@@ -173,6 +173,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "range_end", base64_encode(etcd_prefix+"/config0") }, { "range_end", base64_encode(etcd_prefix+"/config0") },
{ "start_revision", etcd_watch_revision+1 }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_CONFIG_WATCH_ID }, { "watch_id", ETCD_CONFIG_WATCH_ID },
{ "progress_notify", true },
} } } }
}).dump()); }).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -181,6 +182,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") }, { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
{ "start_revision", etcd_watch_revision+1 }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_OSD_STATE_WATCH_ID }, { "watch_id", ETCD_OSD_STATE_WATCH_ID },
{ "progress_notify", true },
} } } }
}).dump()); }).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -189,6 +191,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") }, { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
{ "start_revision", etcd_watch_revision+1 }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_STATE_WATCH_ID }, { "watch_id", ETCD_PG_STATE_WATCH_ID },
{ "progress_notify", true },
} } } }
}).dump()); }).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -197,6 +200,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") }, { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
{ "start_revision", etcd_watch_revision+1 }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID }, { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
{ "progress_notify", true },
} } } }
}).dump()); }).dump());
} }
@@ -315,67 +319,98 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
} }
for (auto & pool_item: value.object_items()) for (auto & pool_item: value.object_items())
{ {
pool_config_t pc;
// ID
pool_id_t pool_id = stoull_full(pool_item.first); pool_id_t pool_id = stoull_full(pool_item.first);
if (!pool_id || pool_id >= POOL_ID_MAX) if (!pool_id || pool_id >= POOL_ID_MAX)
{ {
printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX); printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
continue; continue;
} }
if (pool_item.second["pg_size"].uint64_value() < 1 || pc.id = pool_id;
pool_item.second["scheme"] == "xor" && pool_item.second["pg_size"].uint64_value() < 3) // Pool Name
{ pc.name = pool_item.second["name"].string_value();
printf("Pool %u has invalid pg_size, skipping pool\n", pool_id); if (pc.name == "")
continue;
}
if (pool_item.second["pg_minsize"].uint64_value() < 1 ||
pool_item.second["pg_minsize"].uint64_value() > pool_item.second["pg_size"].uint64_value() ||
pool_item.second["pg_minsize"].uint64_value() < (pool_item.second["pg_size"].uint64_value() - 1))
{
printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
continue;
}
if (pool_item.second["pg_count"].uint64_value() < 1)
{
printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
continue;
}
if (pool_item.second["name"].string_value() == "")
{ {
printf("Pool %u has empty name, skipping pool\n", pool_id); printf("Pool %u has empty name, skipping pool\n", pool_id);
continue; continue;
} }
if (pool_item.second["scheme"] != "replicated" && pool_item.second["scheme"] != "xor") // Failure Domain
pc.failure_domain = pool_item.second["failure_domain"].string_value();
// Coding Scheme
if (pool_item.second["scheme"] == "replicated")
pc.scheme = POOL_SCHEME_REPLICATED;
else if (pool_item.second["scheme"] == "xor")
pc.scheme = POOL_SCHEME_XOR;
else if (pool_item.second["scheme"] == "jerasure")
pc.scheme = POOL_SCHEME_JERASURE;
else
{ {
printf("Pool %u has invalid coding scheme (only \"xor\" and \"replicated\" are allowed), skipping pool\n", pool_id); printf("Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
continue; continue;
} }
if (pool_item.second["max_osd_combinations"].uint64_value() > 0 && // PG Size
pool_item.second["max_osd_combinations"].uint64_value() < 100) pc.pg_size = pool_item.second["pg_size"].uint64_value();
if (pc.pg_size < 1 ||
pool_item.second["pg_size"].uint64_value() < 3 &&
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
pool_item.second["pg_size"].uint64_value() > 256)
{
printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
continue;
}
// Parity Chunks
pc.parity_chunks = pool_item.second["parity_chunks"].uint64_value();
if (pc.scheme == POOL_SCHEME_XOR)
{
if (pc.parity_chunks > 1)
{
printf("Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
continue;
}
pc.parity_chunks = 1;
}
if (pc.scheme == POOL_SCHEME_JERASURE &&
(pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
{
printf("Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
continue;
}
// PG MinSize
pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
{
printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
continue;
}
// PG Count
pc.pg_count = pool_item.second["pg_count"].uint64_value();
if (pc.pg_count < 1)
{
printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
continue;
}
// Max OSD Combinations
pc.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
if (!pc.max_osd_combinations)
pc.max_osd_combinations = 10000;
if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
{ {
printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id); printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
continue; continue;
} }
// PG Stripe Size
pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
uint64_t min_stripe_size = bs_block_size * (pc.scheme == POOL_SCHEME_REPLICATED ? 1 : (pc.pg_size-pc.parity_chunks));
if (pc.pg_stripe_size < min_stripe_size)
pc.pg_stripe_size = min_stripe_size;
// Save
std::swap(pc.pg_config, this->pool_config[pool_id].pg_config);
std::swap(this->pool_config[pool_id], pc);
auto & parsed_cfg = this->pool_config[pool_id]; auto & parsed_cfg = this->pool_config[pool_id];
parsed_cfg.exists = true; parsed_cfg.exists = true;
parsed_cfg.id = pool_id;
parsed_cfg.name = pool_item.second["name"].string_value();
parsed_cfg.scheme = pool_item.second["scheme"] == "replicated" ? POOL_SCHEME_REPLICATED : POOL_SCHEME_XOR;
parsed_cfg.pg_size = pool_item.second["pg_size"].uint64_value();
parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
uint64_t min_stripe_size = bs_block_size *
(parsed_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : parsed_cfg.pg_minsize);
if (parsed_cfg.pg_stripe_size < min_stripe_size)
{
parsed_cfg.pg_stripe_size = min_stripe_size;
}
parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
if (!parsed_cfg.max_osd_combinations)
{
parsed_cfg.max_osd_combinations = 10000;
}
for (auto & pg_item: parsed_cfg.pg_config) for (auto & pg_item: parsed_cfg.pg_config)
{ {
if (pg_item.second.target_set.size() != parsed_cfg.pg_size) if (pg_item.second.target_set.size() != parsed_cfg.pg_size)

View File

@@ -43,7 +43,7 @@ struct pool_config_t
pool_id_t id; pool_id_t id;
std::string name; std::string name;
uint64_t scheme; uint64_t scheme;
uint64_t pg_size, pg_minsize; uint64_t pg_size, pg_minsize, parity_chunks;
uint64_t pg_count; uint64_t pg_count;
uint64_t real_pg_count; uint64_t real_pg_count;
std::string failure_domain; std::string failure_domain;

View File

@@ -116,7 +116,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
return; return;
}); });
} }
clients[peer_fd] = new osd_client_t({ clients[peer_fd] = new osd_client_t((osd_client_t){
.peer_addr = addr, .peer_addr = addr,
.peer_port = peer_port, .peer_port = peer_port,
.peer_fd = peer_fd, .peer_fd = peer_fd,
@@ -233,7 +233,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd; op->peer_fd = cl->peer_fd;
op->req = { op->req = (osd_any_op_t){
.show_conf = { .show_conf = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@@ -402,7 +402,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1; int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = new osd_client_t({ clients[peer_fd] = new osd_client_t((osd_client_t){
.peer_addr = addr, .peer_addr = addr,
.peer_port = ntohs(addr.sin_port), .peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd, .peer_fd = peer_fd,

0
mon/mon-main.js Normal file → Executable file
View File

View File

@@ -9,212 +9,215 @@ const LPOptimizer = require('./lp-optimizer.js');
const stableStringify = require('./stable-stringify.js'); const stableStringify = require('./stable-stringify.js');
const PGUtil = require('./PGUtil.js'); const PGUtil = require('./PGUtil.js');
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
const etcd_allow = new RegExp('^'+[
'config/global',
'config/node_placement',
'config/pools',
'config/osd/[1-9]\\d*',
'config/pgs',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
'mon/master',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'stats',
].join('$|^')+'$');
const etcd_tree = {
config: {
/* global: {
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // ms. min: 100
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 1800, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
// client and osd
use_sync_send_recv: false,
log_level: 0,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50
// osd
etcd_report_interval: 30, // min: 10
run_primary: true,
bind_address: "0.0.0.0",
bind_port: 0,
autosync_interval: 5,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
readonly: false,
print_stats_interval: 3,
// blockstore - fixed in superblock
block_size,
disk_alignment,
journal_block_size,
meta_block_size,
bitmap_granularity,
journal_device,
journal_offset,
journal_size,
disable_journal_fsync,
data_device,
data_offset,
data_size,
disable_data_fsync,
meta_device,
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable
flusher_count,
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
}, */
global: {},
/* node_placement: {
host1: { level: 'host', parent: 'rack1' },
...
}, */
node_placement: {},
/* pools: {
<id>: {
name: 'testpool',
// jerasure uses Reed-Solomon-Vandermonde codes
scheme: 'replicated' | 'xor' | 'jerasure',
pg_size: 3,
pg_minsize: 2,
// number of parity chunks, required for jerasure
parity_chunks?: 1,
pg_count: 100,
failure_domain: 'host',
max_osd_combinations: 10000,
pg_stripe_size: 4194304,
root_node?: 'rack1',
// restrict pool to OSDs having all of these tags
osd_tags?: 'nvme' | [ 'nvme', ... ],
},
...
}, */
pools: {},
osd: {
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
},
/* pgs: {
hash: string,
items: {
<pool_id>: {
<pg_id>: {
osd_set: [ 1, 2, 3 ],
primary: 1,
pause: false,
}
}
}
}, */
pgs: {},
},
osd: {
state: {
/* <osd_num_t>: {
state: "up",
addresses: string[],
host: string,
port: uint16_t,
primary_enabled: boolean,
blockstore_enabled: boolean,
}, */
},
stats: {
/* <osd_num_t>: {
time: number, // unix time
blockstore_ready: boolean,
size: uint64_t, // bytes
free: uint64_t, // bytes
host: string,
op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t },
},
}, */
},
},
mon: {
master: {
/* ip: [ string ], */
},
},
pg: {
state: {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
}
}, */
},
stats: {
/* <pool_id>: {
<pg_id>: {
object_count: uint64_t,
clean_count: uint64_t,
misplaced_count: uint64_t,
degraded_count: uint64_t,
incomplete_count: uint64_t,
write_osd_set: osd_num_t[],
},
}, */
},
history: {
/* <pool_id>: {
<pg_id>: {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint32_t,
},
}, */
},
},
stats: {
/* op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t },
},
object_counts: {
object: uint64_t,
clean: uint64_t,
misplaced: uint64_t,
degraded: uint64_t,
incomplete: uint64_t,
}, */
},
};
// FIXME Split into several files // FIXME Split into several files
class Mon class Mon
{ {
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
static etcd_allow = new RegExp('^'+[
'config/global',
'config/node_placement',
'config/pools',
'config/osd/[1-9]\\d*',
'config/pgs',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
'mon/master',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'stats',
].join('$|^')+'$')
static etcd_tree = {
config: {
/* global: {
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // ms. min: 100
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 1800, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
// client and osd
use_sync_send_recv: false,
log_level: 0,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50
// osd
etcd_report_interval: 30, // min: 10
run_primary: true,
bind_address: "0.0.0.0",
bind_port: 0,
autosync_interval: 5,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
readonly: false,
print_stats_interval: 3,
// blockstore - fixed in superblock
block_size,
disk_alignment,
journal_block_size,
meta_block_size,
bitmap_granularity,
journal_device,
journal_offset,
journal_size,
disable_journal_fsync,
data_device,
data_offset,
data_size,
disable_data_fsync,
meta_device,
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable
flusher_count,
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
}, */
global: {},
/* node_placement: {
host1: { level: 'host', parent: 'rack1' },
...
}, */
node_placement: {},
/* pools: {
<id>: {
name: 'testpool',
scheme: 'xor',
pg_size: 3,
pg_minsize: 2,
pg_count: 100,
failure_domain: 'host',
max_osd_combinations: 10000,
pg_stripe_size: 4194304,
root_node?: 'rack1',
// restrict pool to OSDs having all of these tags
osd_tags?: 'nvme' | [ 'nvme', ... ],
},
...
}, */
pools: {},
osd: {
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
},
/* pgs: {
hash: string,
items: {
<pool_id>: {
<pg_id>: {
osd_set: [ 1, 2, 3 ],
primary: 1,
pause: false,
}
}
}
}, */
pgs: {},
},
osd: {
state: {
/* <osd_num_t>: {
state: "up",
addresses: string[],
host: string,
port: uint16_t,
primary_enabled: boolean,
blockstore_enabled: boolean,
}, */
},
stats: {
/* <osd_num_t>: {
time: number, // unix time
blockstore_ready: boolean,
size: uint64_t, // bytes
free: uint64_t, // bytes
host: string,
op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t },
},
}, */
},
},
mon: {
master: {
/* ip: [ string ], */
},
},
pg: {
state: {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
}
}, */
},
stats: {
/* <pool_id>: {
<pg_id>: {
object_count: uint64_t,
clean_count: uint64_t,
misplaced_count: uint64_t,
degraded_count: uint64_t,
incomplete_count: uint64_t,
write_osd_set: osd_num_t[],
},
}, */
},
history: {
/* <pool_id>: {
<pg_id>: {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint32_t,
},
}, */
},
},
stats: {
/* op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t },
},
object_counts: {
object: uint64_t,
clean: uint64_t,
misplaced: uint64_t,
degraded: uint64_t,
incomplete: uint64_t,
}, */
},
}
constructor(config) constructor(config)
{ {
// FIXME: Maybe prefer local etcd // FIXME: Maybe prefer local etcd
@@ -331,6 +334,7 @@ class Mon
range_end: b64(this.etcd_prefix+'0'), range_end: b64(this.etcd_prefix+'0'),
start_revision: ''+this.etcd_watch_revision, start_revision: ''+this.etcd_watch_revision,
watch_id: 1, watch_id: 1,
progress_notify: true,
}, },
})); }));
this.ws.on('message', (msg) => this.ws.on('message', (msg) =>
@@ -561,19 +565,15 @@ class Mon
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } }, { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
], ],
}, this.config.etcd_mon_timeout, 0); }, this.config.etcd_mon_timeout, 0);
if (!res.succeeded) return false;
{
return false;
}
this.state.config.pgs = new_cfg;
} }
return !has_online; return !has_online;
} }
save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history) save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
{ {
const replicated = this.state.config.pools[pool_id].scheme === 'replicated'; const replicated = new_pgs.length && this.state.config.pools[pool_id].scheme === 'replicated';
const pg_minsize = this.state.config.pools[pool_id].pg_minsize; const pg_minsize = new_pgs.length && this.state.config.pools[pool_id].pg_minsize;
const pg_items = {}; const pg_items = {};
new_pgs.map((osd_set, i) => new_pgs.map((osd_set, i) =>
{ {
@@ -628,13 +628,21 @@ class Mon
} }
} }
this.state.config.pgs.items = this.state.config.pgs.items || {}; this.state.config.pgs.items = this.state.config.pgs.items || {};
this.state.config.pgs.items[pool_id] = pg_items; if (!new_pgs.length)
{
delete this.state.config.pgs.items[pool_id];
}
else
{
this.state.config.pgs.items[pool_id] = pg_items;
}
} }
validate_pool_cfg(pool_id, pool_cfg, warn) validate_pool_cfg(pool_id, pool_cfg, warn)
{ {
pool_cfg.pg_size = Math.floor(pool_cfg.pg_size); pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize); pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count); pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host'; pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000; pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
@@ -644,8 +652,14 @@ class Mon
console.log('Pool ID '+pool_id+' is invalid'); console.log('Pool ID '+pool_id+' is invalid');
return false; return false;
} }
if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' && pool_cfg.scheme !== 'jerasure')
pool_cfg.scheme === 'xor' && pool_cfg.pg_size < 3) {
if (warn)
console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated" and "jerasure" required)');
return false;
}
if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
(pool_cfg.scheme === 'xor' || pool_cfg.scheme == 'jerasure') && pool_cfg.pg_size < 3)
{ {
if (warn) if (warn)
console.log('Pool '+pool_id+' has invalid pg_size'); console.log('Pool '+pool_id+' has invalid pg_size');
@@ -658,6 +672,18 @@ class Mon
console.log('Pool '+pool_id+' has invalid pg_minsize'); console.log('Pool '+pool_id+' has invalid pg_minsize');
return false; return false;
} }
if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
{
if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
return false;
}
if (pool_cfg.scheme === 'jerasure' && (pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
{
if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
return false;
}
if (!pool_cfg.pg_count || pool_cfg.pg_count < 1) if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
{ {
if (warn) if (warn)
@@ -670,12 +696,6 @@ class Mon
console.log('Pool '+pool_id+' has empty name'); console.log('Pool '+pool_id+' has empty name');
return false; return false;
} }
if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated')
{
if (warn)
console.log('Pool '+pool_id+' has invalid coding scheme (only "xor" and "replicated" are allowed)');
return false;
}
if (pool_cfg.max_osd_combinations < 100) if (pool_cfg.max_osd_combinations < 100)
{ {
if (warn) if (warn)
@@ -739,6 +759,24 @@ class Mon
{ {
// Something has changed // Something has changed
const etcd_request = { compare: [], success: [] }; const etcd_request = { compare: [], success: [] };
for (const pool_id in (this.state.config.pgs||{}).items||{})
{
if (!this.state.config.pools[pool_id])
{
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.schedule_recheck();
return;
}
const prev_pgs = [];
for (const pg in this.state.config.pgs.items[pool_id]||{})
{
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
}
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
}
}
for (const pool_id in this.state.config.pools) for (const pool_id in this.state.config.pools)
{ {
const pool_cfg = this.state.config.pools[pool_id]; const pool_cfg = this.state.config.pools[pool_id];
@@ -1228,4 +1266,7 @@ function sha1hex(str)
return hash.digest('hex'); return hash.digest('hex');
} }
Mon.etcd_allow = etcd_allow;
Mon.etcd_tree = etcd_tree;
module.exports = Mon; module.exports = Mon;

View File

@@ -17,6 +17,10 @@
#include "epoll_manager.h" #include "epoll_manager.h"
#include "cluster_client.h" #include "cluster_client.h"
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif
const char *exe_name = NULL; const char *exe_name = NULL;
class nbd_proxy class nbd_proxy

View File

@@ -4,6 +4,7 @@
#include "osd.h" #include "osd.h"
#include "base64.h" #include "base64.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#include "osd_rmw.h"
// Startup sequence: // Startup sequence:
// Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state // Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@@ -32,12 +33,26 @@ void osd_t::init_cluster()
} }
pgs[{ 1, 1 }] = (pg_t){ pgs[{ 1, 1 }] = (pg_t){
.state = PG_PEERING, .state = PG_PEERING,
.scheme = POOL_SCHEME_XOR,
.pg_cursize = 0, .pg_cursize = 0,
.pg_size = 3,
.pg_minsize = 2,
.parity_chunks = 1,
.pool_id = 1, .pool_id = 1,
.pg_num = 1, .pg_num = 1,
.target_set = { 1, 2, 3 }, .target_set = { 1, 2, 3 },
.cur_set = { 0, 0, 0 }, .cur_set = { 0, 0, 0 },
}; };
st_cli.pool_config[1] = (pool_config_t){
.exists = true,
.id = 1,
.name = "testpool",
.scheme = POOL_SCHEME_XOR,
.pg_size = 3,
.pg_minsize = 2,
.pg_count = 1,
.real_pg_count = 1,
};
report_pg_state(pgs[{ 1, 1 }]); report_pg_state(pgs[{ 1, 1 }]);
pg_counts[1] = 1; pg_counts[1] = 1;
} }
@@ -583,6 +598,7 @@ void osd_t::apply_pg_config()
.pg_cursize = 0, .pg_cursize = 0,
.pg_size = pool_item.second.pg_size, .pg_size = pool_item.second.pg_size,
.pg_minsize = pool_item.second.pg_minsize, .pg_minsize = pool_item.second.pg_minsize,
.parity_chunks = pool_item.second.parity_chunks,
.pool_id = pool_id, .pool_id = pool_id,
.pg_num = pg_num, .pg_num = pg_num,
.reported_epoch = pg_cfg.epoch, .reported_epoch = pg_cfg.epoch,
@@ -590,6 +606,10 @@ void osd_t::apply_pg_config()
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()), .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set, .target_set = pg_cfg.target_set,
}; };
if (pg.scheme == POOL_SCHEME_JERASURE)
{
use_jerasure(pg.pg_size, pg.pg_size-pg.parity_chunks, true);
}
this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num }); this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
pg.print_state(); pg.print_state();
if (pg_cfg.cur_primary == this->osd_num) if (pg_cfg.cur_primary == this->osd_num)
@@ -778,6 +798,10 @@ void osd_t::report_pg_states()
{ {
// Remove offline PGs after reporting their state // Remove offline PGs after reporting their state
this->pgs.erase(pg_it); this->pgs.erase(pg_it);
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
{
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_size-pg_it->second.parity_chunks, false);
}
} }
} }
} }

View File

@@ -166,7 +166,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
{ {
// local // local
clock_gettime(CLOCK_REALTIME, &op->tv_begin); clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t({ op->bs_op = new blockstore_op_t((blockstore_op_t){
.opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE), .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
.callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op) .callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op)
{ {
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->iov.push_back(op->buf, count * sizeof(obj_ver_id)); op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
op->peer_fd = peer_fd; op->peer_fd = peer_fd;
op->req = { op->req = (osd_any_op_t){
.sec_stab = { .sec_stab = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@@ -246,7 +246,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{ {
op->osd_op = new osd_op_t(); op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT; op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = { op->osd_op->req = (osd_any_op_t){
.rw = { .rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,

View File

@@ -5,6 +5,7 @@
#define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2 #define POOL_SCHEME_XOR 2
#define POOL_SCHEME_JERASURE 3
#define POOL_ID_MAX 0x10000 #define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16 #define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS)) #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))

View File

@@ -307,7 +307,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd; op->peer_fd = cl->peer_fd;
op->req = { op->req = (osd_any_op_t){
.sec_sync = { .sec_sync = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@@ -382,7 +382,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = c_cli.osd_peer_fds[role_osd]; op->peer_fd = c_cli.osd_peer_fds[role_osd];
op->req = { op->req = (osd_any_op_t){
.sec_list = { .sec_list = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,

View File

@@ -75,7 +75,7 @@ struct pg_t
{ {
int state = 0; int state = 0;
uint64_t scheme = 0; uint64_t scheme = 0;
uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0; uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, parity_chunks = 0;
pool_id_t pool_id = 0; pool_id_t pool_id = 0;
pg_num_t pg_num = 0; pg_num_t pg_num = 0;
uint64_t clean_count = 0, total_count = 0; uint64_t clean_count = 0, total_count = 0;

View File

@@ -16,8 +16,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{ {
// PG number is calculated from the offset // PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size) // Our EC scheme stores data in fixed chunks equal to (K*block size)
// K = pg_minsize in case of EC/XOR, or 1 for replicated pools // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode); pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
auto pool_cfg_it = st_cli.pool_config.find(pool_id); auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it == st_cli.pool_config.end()) if (pool_cfg_it == st_cli.pool_config.end())
{ {
@@ -26,7 +27,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
return false; return false;
} }
auto & pool_cfg = pool_cfg_it->second; auto & pool_cfg = pool_cfg_it->second;
uint64_t pg_block_size = bs_block_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize); uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = bs_block_size * pg_data_size;
object_id oid = { object_id oid = {
.inode = cur_op->req.rw.inode, .inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe // oid.stripe = starting offset of the parity stripe
@@ -37,6 +39,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE)) if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
{ {
// This OSD is not primary for this PG or the PG is inactive // This OSD is not primary for this PG or the PG is inactive
// FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
finish_op(cur_op, -EPIPE); finish_op(cur_op, -EPIPE);
return false; return false;
} }
@@ -54,9 +57,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
op_data->oid = oid; op_data->oid = oid;
op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1)); op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
op_data->scheme = pool_cfg.scheme; op_data->scheme = pool_cfg.scheme;
op_data->pg_data_size = pg_data_size;
cur_op->op_data = op_data; cur_op->op_data = op_data;
split_stripes((pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_minsize), split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
pg_it->second.inflight++; pg_it->second.inflight++;
return true; return true;
} }
@@ -101,7 +104,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
else if (op_data->st == 2) goto resume_2; else if (op_data->st == 2) goto resume_2;
{ {
auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }]; auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize); role++) for (int role = 0; role < op_data->pg_data_size; role++)
{ {
op_data->stripes[role].read_start = op_data->stripes[role].req_start; op_data->stripes[role].read_start = op_data->stripes[role].req_start;
op_data->stripes[role].read_end = op_data->stripes[role].req_end; op_data->stripes[role].read_end = op_data->stripes[role].req_end;
@@ -112,24 +115,23 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
// Fast happy-path // Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
(op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize), 0);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, submit_primary_subops(SUBMIT_READ, op_data->target_ver,
(op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : pg.pg_minsize), pg.cur_set.data(), cur_op); (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
op_data->st = 1; op_data->st = 1;
} }
else else
{ {
// PG may be degraded or have misplaced objects // PG may be degraded or have misplaced objects
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0) if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
{ {
finish_op(cur_op, -EIO); finish_op(cur_op, -EIO);
return; return;
} }
// Submit reads // Submit reads
op_data->pg_minsize = pg.pg_minsize;
op_data->pg_size = pg.pg_size; op_data->pg_size = pg.pg_size;
op_data->scheme = pg.scheme;
op_data->degraded = 1; op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0); cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op); submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
@@ -147,14 +149,17 @@ resume_2:
if (op_data->degraded) if (op_data->degraded)
{ {
// Reconstruct missing stripes // Reconstruct missing stripes
// FIXME: Always EC(k+1) by now. Add different coding schemes
osd_rmw_stripe_t *stripes = op_data->stripes; osd_rmw_stripe_t *stripes = op_data->stripes;
for (int role = 0; role < op_data->pg_minsize; role++) if (op_data->scheme == POOL_SCHEME_XOR)
{
reconstruct_stripes_xor(stripes, op_data->pg_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size);
}
for (int role = 0; role < op_data->pg_size; role++)
{ {
if (stripes[role].read_end != 0 && stripes[role].missing)
{
reconstruct_stripe_xor(stripes, op_data->pg_size, role);
}
if (stripes[role].req_end != 0) if (stripes[role].req_end != 0)
{ {
// Send buffer in parts to avoid copying // Send buffer in parts to avoid copying
@@ -245,7 +250,7 @@ resume_1:
else else
{ {
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set, cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size); pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
if (!cur_op->rmw_buf) if (!cur_op->rmw_buf)
{ {
// Refuse partial overwrite of an incomplete object // Refuse partial overwrite of an incomplete object
@@ -285,7 +290,14 @@ resume_3:
else else
{ {
// Recover missing stripes, calculate parity // Recover missing stripes, calculate parity
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size); if (pg.scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
{
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
}
} }
// Send writes // Send writes
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch) if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)

View File

@@ -25,7 +25,7 @@ struct osd_primary_op_data_t
uint64_t fact_ver = 0; uint64_t fact_ver = 0;
uint64_t scheme = 0; uint64_t scheme = 0;
int n_subops = 0, done = 0, errors = 0, epipe = 0; int n_subops = 0, done = 0, errors = 0, epipe = 0;
int degraded = 0, pg_size, pg_minsize; int degraded = 0, pg_size, pg_data_size;
osd_rmw_stripe_t *stripes; osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL; osd_op_t *subops = NULL;
uint64_t *prev_set = NULL; uint64_t *prev_set = NULL;

View File

@@ -11,7 +11,7 @@ void osd_t::autosync()
{ {
autosync_op = new osd_op_t(); autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN; autosync_op->op_type = OSD_OP_IN;
autosync_op->req = { autosync_op->req = (osd_any_op_t){
.sync = { .sync = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@@ -510,7 +510,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
{ {
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin); clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op; subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({ subops[i].bs_op = new blockstore_op_t((blockstore_op_t){
.opcode = BS_OP_STABLE, .opcode = BS_OP_STABLE,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop) .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
{ {

View File

@@ -3,6 +3,9 @@
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <map>
#include "xor.h" #include "xor.h"
#include "osd_rmw.h" #include "osd_rmw.h"
#include "malloc_or_die.h" #include "malloc_or_die.h"
@@ -75,44 +78,189 @@ void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start,
} }
} }
void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role) void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size)
{ {
int prev = -2; for (int role = 0; role < pg_size; role++)
for (int other = 0; other < pg_size; other++)
{ {
if (other != role) if (stripes[role].read_end != 0 && stripes[role].missing)
{ {
if (prev == -2) // Reconstruct missing stripe (XOR k+1)
int prev = -2;
for (int other = 0; other < pg_size; other++)
{ {
prev = other; if (other != role)
} {
else if (prev >= 0) if (prev == -2)
{ {
assert(stripes[role].read_start >= stripes[prev].read_start && prev = other;
stripes[role].read_start >= stripes[other].read_start); }
memxor( else if (prev >= 0)
stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start), {
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start), assert(stripes[role].read_start >= stripes[prev].read_start &&
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start stripes[role].read_start >= stripes[other].read_start);
); memxor(
prev = -1; stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
} stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
else stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
{ );
assert(stripes[role].read_start >= stripes[other].read_start); prev = -1;
memxor( }
stripes[role].read_buf, else
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start), {
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start assert(stripes[role].read_start >= stripes[other].read_start);
); memxor(
stripes[role].read_buf,
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
}
}
} }
} }
} }
} }
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size) struct reed_sol_erased_t
{ {
for (int role = 0; role < minsize; role++) int *data;
int size;
};
inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
{
for (int i = 0; i < a.size && i < b.size; i++)
{
if (a.data[i] < b.data[i])
return -1;
else if (a.data[i] > b.data[i])
return 1;
}
return 0;
}
struct reed_sol_matrix_t
{
int refs = 0;
int *data;
std::map<reed_sol_erased_t, int*> decodings;
};
std::map<uint64_t, reed_sol_matrix_t> matrices;
void use_jerasure(int pg_size, int pg_minsize, bool use)
{
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key);
if (rs_it == matrices.end())
{
if (!use)
{
return;
}
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, 32);
matrices[key] = (reed_sol_matrix_t){
.refs = 0,
.data = matrix,
};
rs_it = matrices.find(key);
}
rs_it->second.refs += (!use ? -1 : 1);
if (rs_it->second.refs <= 0)
{
free(rs_it->second.data);
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
{
int *data = dec_it->second;
rs_it->second.decodings.erase(dec_it++);
free(data);
}
matrices.erase(rs_it);
}
}
reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
{
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key);
if (rs_it == matrices.end())
{
throw std::runtime_error("jerasure matrix not initialized");
}
return &rs_it->second;
}
// jerasure_matrix_decode() decodes all chunks at once and tries to reencode all missing coding chunks.
// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
// the decoding matrix.
// all these flaws are fixed in this function:
int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
{
int edd = 0;
int erased[pg_size] = { 0 };
for (int i = 0; i < pg_size; i++)
if (stripes[i].read_end == 0 || stripes[i].missing)
erased[i] = 1;
for (int i = 0; i < pg_minsize; i++)
if (stripes[i].read_end != 0 && stripes[i].missing)
edd++;
if (edd == 0)
return NULL;
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
if (dec_it == matrix->decodings.end())
{
int *dm_ids = (int*)malloc(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
int *decoding_matrix = dm_ids + pg_minsize;
if (!dm_ids)
throw std::bad_alloc();
// we always use row_k_ones=1 and w=32
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, 32, matrix->data, erased, decoding_matrix, dm_ids) < 0)
{
free(dm_ids);
throw std::runtime_error("jerasure_make_decoding_matrix() failed");
}
int *erased_copy = dm_ids + pg_minsize + pg_minsize*pg_minsize;
memcpy(erased_copy, erased, pg_size*sizeof(int));
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
return dm_ids;
}
return dec_it->second;
}
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
{
int *dm_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
if (!dm_ids)
{
return;
}
int *decoding_matrix = dm_ids + pg_minsize;
char *data_ptrs[pg_size] = { 0 };
for (int role = 0; role < pg_minsize; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
{
for (int other = 0; other < pg_size; other++)
{
if (stripes[other].read_end != 0 && !stripes[other].missing)
{
assert(stripes[other].read_start <= stripes[role].read_start);
assert(stripes[other].read_end >= stripes[role].read_end);
data_ptrs[other] = (char*)(stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start));
}
}
data_ptrs[role] = (char*)stripes[role].read_buf;
jerasure_matrix_dotprod(
pg_minsize, 32, decoding_matrix+(role*pg_minsize), dm_ids, role,
data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
);
}
}
}
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
{
for (int role = 0; role < pg_minsize; role++)
{ {
if (stripes[role].read_end != 0 && osd_set[role] == 0) if (stripes[role].read_end != 0 && osd_set[role] == 0)
{ {
@@ -121,21 +269,21 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi
// We need at least pg_minsize stripes to recover the lost part. // We need at least pg_minsize stripes to recover the lost part.
// FIXME: LRC EC and similar don't require to read all other stripes. // FIXME: LRC EC and similar don't require to read all other stripes.
int exist = 0; int exist = 0;
for (int j = 0; j < size; j++) for (int j = 0; j < pg_size; j++)
{ {
if (osd_set[j] != 0) if (osd_set[j] != 0)
{ {
extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]); extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]);
exist++; exist++;
if (exist >= minsize) if (exist >= pg_minsize)
{ {
break; break;
} }
} }
} }
if (exist < minsize) if (exist < pg_minsize)
{ {
// Less than minsize stripes are available for this object // Less than pg_minsize stripes are available for this object
return -1; return -1;
} }
} }
@@ -369,19 +517,9 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
} }
} }
void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size) static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t &start, uint32_t &end)
{ {
int pg_minsize = pg_size-1;
for (int role = 0; role < pg_size; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
{
// Reconstruct missing stripe (XOR k+1)
reconstruct_stripe_xor(stripes, pg_size, role);
break;
}
}
uint32_t start = 0, end = 0;
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set) if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
{ {
// Required for the next two if()s // Required for the next two if()s
@@ -421,6 +559,53 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
} }
} }
} }
}
static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
{
if (write_osd_set != read_osd_set)
{
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
{
// Copy new parity into the read buffer to write it back
memcpy(
stripes[role].read_buf + start,
stripes[role].write_buf,
end - start
);
stripes[role].write_buf = stripes[role].read_buf;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;
}
}
}
#ifdef RMW_DEBUG
printf("calc_rmw_parity:\n");
for (int role = 0; role < pg_size; role++)
{
auto & s = stripes[role];
printf(
"Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
read_osd_set[role], write_osd_set[role],
s.req_start, s.req_end,
s.read_start, s.read_end,
s.write_start, s.write_end,
(uint64_t)s.read_buf,
(uint64_t)s.write_buf
);
}
#endif
}
void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
{
int pg_minsize = pg_size-1;
reconstruct_stripes_xor(stripes, pg_size);
uint32_t start = 0, end = 0;
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
if (write_osd_set[pg_minsize] != 0 && end != 0) if (write_osd_set[pg_minsize] != 0 && end != 0)
{ {
// Calculate new parity (XOR k+1) // Calculate new parity (XOR k+1)
@@ -449,38 +634,71 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
} }
} }
} }
if (write_osd_set != read_osd_set) calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
}
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
{
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize);
uint32_t start = 0, end = 0;
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
if (end != 0)
{ {
for (int role = pg_minsize; role < pg_size; role++) int i;
for (i = pg_minsize; i < pg_size; i++)
{ {
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size)) if (write_osd_set[i] != 0)
break;
}
if (i < pg_size)
{
// Calculate new coding chunks
buf_len_t bufs[pg_size][3];
int nbuf[pg_size] = { 0 }, curbuf[pg_size] = { 0 };
uint32_t positions[pg_size];
void *data_ptrs[pg_size] = { 0 };
for (int i = 0; i < pg_minsize; i++)
{ {
// Copy new parity into the read buffer to write it back get_old_new_buffers(stripes[i], start, end, bufs[i], nbuf[i]);
memcpy( positions[i] = start;
stripes[role].read_buf + start, }
stripes[role].write_buf, for (int i = pg_minsize; i < pg_size; i++)
end - start {
bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
positions[i] = start;
}
uint32_t pos = start;
while (pos < end)
{
uint32_t next_end = end;
for (int i = 0; i < pg_size; i++)
{
assert(curbuf[i] < nbuf[i]);
assert(bufs[i][curbuf[i]].buf);
data_ptrs[i] = bufs[i][curbuf[i]].buf + pos-positions[i];
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end > this_end)
next_end = this_end;
}
assert(next_end > pos);
for (int i = 0; i < pg_size; i++)
{
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
if (next_end >= this_end)
{
positions[i] += bufs[i][curbuf[i]].len;
curbuf[i]++;
}
}
jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, 32, matrix->data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
); );
stripes[role].write_buf = stripes[role].read_buf; pos = next_end;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;
} }
} }
} }
#ifdef RMW_DEBUG calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
printf("calc_rmw_xor:\n");
for (int role = 0; role < pg_size; role++)
{
auto & s = stripes[role];
printf(
"Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
read_osd_set[role], write_osd_set[role],
s.req_start, s.req_end,
s.read_start, s.read_end,
s.write_start, s.write_end,
(uint64_t)s.read_buf,
(uint64_t)s.write_buf
);
}
#endif
} }

View File

@@ -26,11 +26,13 @@ struct osd_rmw_stripe_t
bool missing; bool missing;
}; };
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes); void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes);
void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role); void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size);
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size); int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size);
void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size); void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);
@@ -38,3 +40,10 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size); uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);
void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size); void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
void use_jerasure(int pg_size, int pg_minsize, bool use);
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize);
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);

View File

@@ -18,107 +18,8 @@ void test9();
void test10(); void test10();
void test11(); void test11();
void test12(); void test12();
void test13();
/*** void test14();
Cases:
1. split(offset=128K-4K, len=8K)
= [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
= { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
= { read: [ 0, 128K-4K ] }
4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
= {
read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
+ check write2 buffer
5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity_xor(): {
write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write1==read1,
}
+ check write1 buffer
+ check write2 buffer
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
+ check write2 buffer
9. object recovery case:
calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
input buffer: NULL,
rmw buffer: [ read0, read1, read2 ],
}
then, after calc_rmw_parity_xor(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write0==read0,
}
+ check write0 buffer
10. full overwrite/recovery case:
calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2 ],
}
then, after calc_rmw_parity_xor(): all the same
+ check write2 buffer
10. partial recovery case:
calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
input buffer: [ write1 ],
rmw buffer: [ write2, read0 ],
}
then, after calc_rmw_parity_xor(): all the same
+ check write2 buffer
***/
int main(int narg, char *args[]) int main(int narg, char *args[])
{ {
@@ -142,6 +43,10 @@ int main(int narg, char *args[])
test11(); test11();
// Test 12 // Test 12
test12(); test12();
// Test 13
test13();
// Test 14
test14();
// End // End
printf("all ok\n"); printf("all ok\n");
return 0; return 0;
@@ -169,6 +74,19 @@ void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
printf("\n"); printf("\n");
} }
/***
1. split(offset=128K-4K, len=8K)
= [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
read(offset=128K-4K, len=8K, osd_set=[1,0,3])
= { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
= { read: [ 0, 128K-4K ] }
***/
void test1() void test1()
{ {
osd_num_t osd_set[3] = { 1, 0, 3 }; osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -188,11 +106,24 @@ void test1()
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096); assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
// Test 1.3 // Test 1.3
stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 }; stripes[0] = (osd_rmw_stripe_t){ .req_start = 128*1024-4096, .req_end = 128*1024 };
cover_read(0, 128*1024, stripes[0]); cover_read(0, 128*1024, stripes[0]);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096); assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
} }
/***
4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
= {
read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
+ check write2 buffer
***/
void test4() void test4()
{ {
osd_num_t osd_set[3] = { 1, 0, 3 }; osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -226,6 +157,19 @@ void test4()
free(write_buf); free(write_buf);
} }
/***
5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
***/
void test5() void test5()
{ {
osd_num_t osd_set[3] = { 1, 0, 3 }; osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -254,6 +198,19 @@ void test5()
free(write_buf); free(write_buf);
} }
/***
6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
***/
void test6() void test6()
{ {
osd_num_t osd_set[3] = { 1, 2, 3 }; osd_num_t osd_set[3] = { 1, 2, 3 };
@@ -278,6 +235,24 @@ void test6()
free(write_buf); free(write_buf);
} }
/***
7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity_xor(): {
write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write1==read1,
}
+ check write1 buffer
+ check write2 buffer
***/
void test7() void test7()
{ {
osd_num_t osd_set[3] = { 1, 0, 3 }; osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -318,6 +293,19 @@ void test7()
free(write_buf); free(write_buf);
} }
/***
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
+ check write2 buffer
***/
void test8() void test8()
{ {
osd_num_t osd_set[3] = { 0, 2, 3 }; osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -355,6 +343,24 @@ void test8()
free(write_buf); free(write_buf);
} }
/***
9. object recovery case:
calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
input buffer: NULL,
rmw buffer: [ read0, read1, read2 ],
}
then, after calc_rmw_parity_xor(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write0==read0,
}
+ check write0 buffer
***/
void test9() void test9()
{ {
osd_num_t osd_set[3] = { 0, 2, 3 }; osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -395,6 +401,21 @@ void test9()
free(rmw_buf); free(rmw_buf);
} }
/***
10. full overwrite/recovery case:
calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2 ],
}
then, after calc_rmw_parity_xor(): all the same
+ check write2 buffer
***/
void test10() void test10()
{ {
osd_num_t osd_set[3] = { 1, 0, 0 }; osd_num_t osd_set[3] = { 1, 0, 0 };
@@ -436,6 +457,21 @@ void test10()
free(write_buf); free(write_buf);
} }
/***
11. partial recovery case:
calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
input buffer: [ write1 ],
rmw buffer: [ write2, read0 ],
}
then, after calc_rmw_parity_xor(): all the same
+ check write2 buffer
***/
void test11() void test11()
{ {
osd_num_t osd_set[3] = { 1, 0, 0 }; osd_num_t osd_set[3] = { 1, 0, 0 };
@@ -477,17 +513,32 @@ void test11()
free(write_buf); free(write_buf);
} }
/***
12. parity recovery case:
calc_rmw(offset=0, len=0, read_osd_set=[1,2,0], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
input buffer: [],
rmw buffer: [ write2, read0, read1 ],
}
then, after calc_rmw_parity_xor(): all the same
+ check write2 buffer
***/
void test12() void test12()
{ {
osd_num_t osd_set[3] = { 1, 2, 0 }; osd_num_t osd_set[3] = { 1, 2, 0 };
osd_num_t write_osd_set[3] = { 1, 2, 3 }; osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 }; osd_rmw_stripe_t stripes[3] = { 0 };
// Test 11.0 // Test 12.0
split_stripes(2, 128*1024, 0, 0, stripes); split_stripes(2, 128*1024, 0, 0, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0); assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0); assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
// Test 11.1 // Test 12.1
void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024); void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
assert(rmw_buf); assert(rmw_buf);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
@@ -502,7 +553,7 @@ void test12()
assert(stripes[0].write_buf == NULL); assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == NULL); assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf); assert(stripes[2].write_buf == rmw_buf);
// Test 11.2 // Test 12.2
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[1].read_buf, 128*1024, PATTERN2); set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024); calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
@@ -515,3 +566,217 @@ void test12()
check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2); check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
free(rmw_buf); free(rmw_buf);
} }
/***
13. basic jerasure 2+2 test
calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0,0], write_set=[1,2,3,4])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, write3, read0, read1 ],
}
then, after calc_rmw_parity_jerasure(): all the same
then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
***/
/**
 * Test 13: basic jerasure 2+2 read-modify-write cycle.
 * Plans and encodes a partial overwrite spanning the chunk 0 / chunk 1
 * boundary, then simulates reads with both data chunks missing
 * (read_osd_set=[0,0,3,4]) and verifies the decoded contents.
 */
void test13()
{
    const unsigned bs = 128*1024; // chunk (stripe) size used throughout this test
    osd_num_t osd_set[4] = { 1, 2, 0, 0 };
    osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
    osd_rmw_stripe_t stripes[4] = { 0 };
    use_jerasure(4, 2, true);
    // Test 13.0: split a write that covers the tail of chunk 0 and the head of chunk 1
    void *write_buf = malloc_or_die(8192);
    split_stripes(2, bs, bs-4096, 8192, stripes);
    assert(stripes[0].req_start == bs-4096 && stripes[0].req_end == bs);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    // Test 13.1: plan the read-modify-write
    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 4, write_osd_set, bs);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == bs-4096);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == bs);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
    assert(stripes[0].write_start == bs-4096 && stripes[0].write_end == bs);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == bs);
    assert(stripes[3].write_start == 0 && stripes[3].write_end == bs);
    assert(stripes[0].read_buf == rmw_buf+2*bs);
    assert(stripes[1].read_buf == rmw_buf+3*bs-4096);
    assert(stripes[2].read_buf == NULL);
    assert(stripes[3].read_buf == NULL);
    assert(stripes[0].write_buf == write_buf);
    assert(stripes[1].write_buf == write_buf+4096);
    assert(stripes[2].write_buf == rmw_buf);
    assert(stripes[3].write_buf == rmw_buf+bs);
    // Test 13.2: encode parity chunks
    set_pattern(write_buf, 8192, PATTERN3);
    set_pattern(stripes[0].read_buf, bs-4096, PATTERN1);
    set_pattern(stripes[1].read_buf, bs-4096, PATTERN2);
    calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, bs);
    assert(stripes[0].write_start == bs-4096 && stripes[0].write_end == bs);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == bs);
    assert(stripes[3].write_start == 0 && stripes[3].write_end == bs);
    assert(stripes[0].write_buf == write_buf);
    assert(stripes[1].write_buf == write_buf+4096);
    assert(stripes[2].write_buf == rmw_buf);
    assert(stripes[3].write_buf == rmw_buf+bs);
    // Test 13.3: full decode (both data chunks lost) and verify
    osd_num_t read_osd_set[4] = { 0, 0, 3, 4 };
    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, bs, 0, 2*bs, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == bs);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == bs);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    for (int i = 0; i < 4; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
        stripes[i].read_end = stripes[i].req_end;
    }
    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
    // after extension every chunk must be read in full
    for (int i = 0; i < 4; i++)
        assert(stripes[i].read_start == 0 && stripes[i].read_end == bs);
    void *read_buf = alloc_read_buffer(stripes, 4, 0);
    assert(read_buf);
    for (int i = 0; i < 4; i++)
        assert(stripes[i].read_buf == read_buf + i*bs);
    memcpy(read_buf+2*bs, rmw_buf, bs);
    memcpy(read_buf+3*bs, rmw_buf+bs, bs);
    reconstruct_stripes_jerasure(stripes, 4, 2);
    check_pattern(stripes[0].read_buf, bs-4096, PATTERN1);
    check_pattern(stripes[0].read_buf+bs-4096, 4096, PATTERN3);
    check_pattern(stripes[1].read_buf, 4096, PATTERN3);
    check_pattern(stripes[1].read_buf+4096, bs-4096, PATTERN2);
    free(read_buf);
    // Test 13.4: partial decode (only the 1st chunk is requested) and verify
    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, bs, 0, bs, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == bs);
    for (int i = 1; i < 4; i++)
        assert(stripes[i].req_start == 0 && stripes[i].req_end == 0);
    for (int i = 0; i < 4; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
        stripes[i].read_end = stripes[i].req_end;
    }
    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == bs);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == bs);
    assert(stripes[3].read_start == 0 && stripes[3].read_end == bs);
    read_buf = alloc_read_buffer(stripes, 4, 0);
    assert(read_buf);
    assert(stripes[0].read_buf == read_buf);
    assert(stripes[1].read_buf == NULL);
    assert(stripes[2].read_buf == read_buf+bs);
    assert(stripes[3].read_buf == read_buf+2*bs);
    memcpy(read_buf+bs, rmw_buf, bs);
    memcpy(read_buf+2*bs, rmw_buf+bs, bs);
    reconstruct_stripes_jerasure(stripes, 4, 2);
    check_pattern(stripes[0].read_buf, bs-4096, PATTERN1);
    check_pattern(stripes[0].read_buf+bs-4096, 4096, PATTERN3);
    free(read_buf);
    // Done, release buffers and the cached jerasure matrix
    free(rmw_buf);
    free(write_buf);
    use_jerasure(4, 2, false);
}
/***
14. basic jerasure 2+1 test
   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0], write_set=[1,2,3])
   = {
     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
     input buffer: [ write0, write1 ],
     rmw buffer: [ write2, read0, read1 ],
   }
   then, after calc_rmw_parity_jerasure(): all the same
   then simulate read with read_osd_set=[0,2,3] and check read0 buffer
***/
/**
 * Test 14: basic jerasure 2+1 read-modify-write cycle (single parity chunk,
 * equivalent to XOR parity). Plans and encodes a partial overwrite spanning
 * the chunk boundary, then simulates a read with data chunk 0 missing
 * (read_osd_set=[0,2,3]) and verifies the reconstructed contents.
 */
void test14()
{
    use_jerasure(3, 2, true);
    osd_num_t osd_set[3] = { 1, 2, 0 };
    osd_num_t write_osd_set[3] = { 1, 2, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 14.0: split a write that covers the tail of chunk 0 and the head of chunk 1
    void *write_buf = malloc_or_die(8192);
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    // Test 14.1: plan the read-modify-write
    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+2*128*1024-4096);
    assert(stripes[2].read_buf == NULL);
    assert(stripes[0].write_buf == write_buf);
    assert(stripes[1].write_buf == write_buf+4096);
    assert(stripes[2].write_buf == rmw_buf);
    // Test 14.2: encode the parity chunk
    set_pattern(write_buf, 8192, PATTERN3);
    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
    calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].write_buf == write_buf);
    assert(stripes[1].write_buf == write_buf+4096);
    assert(stripes[2].write_buf == rmw_buf);
    // Test 14.3: decode chunk 0 from chunk 1 + parity and verify
    // (array size fixed: 3 OSDs in this test, not 4)
    osd_num_t read_osd_set[3] = { 0, 2, 3 };
    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 128*1024, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    for (int role = 0; role < 3; role++)
    {
        stripes[role].read_start = stripes[role].req_start;
        stripes[role].read_end = stripes[role].req_end;
    }
    assert(extend_missing_stripes(stripes, read_osd_set, 2, 3) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
    void *read_buf = alloc_read_buffer(stripes, 3, 0);
    assert(read_buf);
    assert(stripes[0].read_buf == read_buf);
    assert(stripes[1].read_buf == read_buf+128*1024);
    assert(stripes[2].read_buf == read_buf+2*128*1024);
    set_pattern(stripes[1].read_buf, 4096, PATTERN3);
    set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
    memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
    reconstruct_stripes_jerasure(stripes, 3, 2);
    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
    free(read_buf);
    // Done, release buffers and the cached jerasure matrix
    free(rmw_buf);
    free(write_buf);
    use_jerasure(3, 2, false);
}

84
qemu-3.1-vitastor.patch Normal file
View File

@@ -0,0 +1,84 @@
Index: qemu-3.1+dfsg/qapi/block-core.json
===================================================================
--- qemu-3.1+dfsg.orig/qapi/block-core.json
+++ qemu-3.1+dfsg/qapi/block-core.json
@@ -2617,7 +2617,7 @@
##
{ 'enum': 'BlockdevDriver',
'data': [ 'blkdebug', 'blklogwrites', 'blkverify', 'bochs', 'cloop',
- 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster',
+ 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'vitastor',
'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks',
'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog',
@@ -3367,6 +3367,24 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @etcd_host: etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { 'inode': 'uint64',
+ 'pool': 'uint64',
+ 'size': 'uint64',
+ 'etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -3713,6 +3731,7 @@
'rbd': 'BlockdevOptionsRbd',
'replication':'BlockdevOptionsReplication',
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4158,6 +4177,17 @@
'*block-state-zero': 'bool' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVpcSubformat:
#
# @dynamic: Growing image file
@@ -4212,6 +4242,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-3.1+dfsg/scripts/modules/module_block.py
===================================================================
--- qemu-3.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-3.1+dfsg/scripts/modules/module_block.py
@@ -88,6 +88,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

84
qemu-4.2-vitastor.patch Normal file
View File

@@ -0,0 +1,84 @@
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json 2020-11-07 22:57:38.932613674 +0000
+++ qemu.orig/qapi/block-core.json 2020-11-07 22:59:49.890722862 +0000
@@ -2907,7 +2907,7 @@
'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog',
+ 'sheepdog', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
##
@@ -3725,6 +3725,24 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @etcd_host: etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { 'inode': 'uint64',
+ 'pool': 'uint64',
+ 'size': 'uint64',
+ 'etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4084,6 +4102,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4461,6 +4480,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4722,6 +4752,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu/scripts/modules/module_block.py
===================================================================
--- qemu.orig/scripts/modules/module_block.py 2020-11-07 22:57:38.936613739 +0000
+++ qemu/scripts/modules/module_block.py 2020-11-07 22:59:49.890722862 +0000
@@ -86,6 +86,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

84
qemu-5.0-vitastor.patch Normal file
View File

@@ -0,0 +1,84 @@
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
@@ -2798,7 +2798,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog',
+ 'sheepdog', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
##
@@ -3635,6 +3635,24 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @etcd_host: etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { 'inode': 'uint64',
+ 'pool': 'uint64',
+ 'size': 'uint64',
+ 'etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -3995,6 +4013,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4365,6 +4384,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4626,6 +4656,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu/scripts/modules/module_block.py
===================================================================
--- qemu.orig/scripts/modules/module_block.py
+++ qemu/scripts/modules/module_block.py
@@ -85,6 +85,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

84
qemu-5.1-vitastor.patch Normal file
View File

@@ -0,0 +1,84 @@
Index: qemu-5.1+dfsg/qapi/block-core.json
===================================================================
--- qemu-5.1+dfsg.orig/qapi/block-core.json
+++ qemu-5.1+dfsg/qapi/block-core.json
@@ -2807,7 +2807,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog',
+ 'sheepdog', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
##
@@ -3644,6 +3644,24 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @etcd_host: etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { 'inode': 'uint64',
+ 'pool': 'uint64',
+ 'size': 'uint64',
+ 'etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -3988,6 +4006,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4376,6 +4395,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4637,6 +4667,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-5.1+dfsg/scripts/modules/module_block.py
===================================================================
--- qemu-5.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-5.1+dfsg/scripts/modules/module_block.py
@@ -86,6 +86,7 @@ if __name__ == '__main__':
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

View File

@@ -3,11 +3,10 @@
// QEMU block driver // QEMU block driver
#define BUILD_DSO
#define _GNU_SOURCE #define _GNU_SOURCE
#include "qemu/osdep.h" #include "qemu/osdep.h"
#include "qemu/units.h"
#include "block/block_int.h" #include "block/block_int.h"
#include "block/qdict.h"
#include "qapi/error.h" #include "qapi/error.h"
#include "qapi/qmp/qdict.h" #include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h" #include "qapi/qmp/qerror.h"
@@ -15,10 +14,28 @@
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "qemu/module.h" #include "qemu/module.h"
#include "qemu/option.h" #include "qemu/option.h"
#if QEMU_VERSION_MAJOR >= 3
#include "qemu/units.h"
#include "block/qdict.h"
#include "qemu/cutils.h" #include "qemu/cutils.h"
#else
#include "qapi/qmp/qint.h"
#define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
#define qobject_unref QDECREF
#endif
#include "qemu_proxy.h" #include "qemu_proxy.h"
void qemu_module_dummy(void)
{
}
void DSO_STAMP_FUN(void)
{
}
typedef struct VitastorClient typedef struct VitastorClient
{ {
void *proxy; void *proxy;
@@ -176,12 +193,14 @@ static void vitastor_close(BlockDriverState *bs)
g_free(client->etcd_prefix); g_free(client->etcd_prefix);
} }
#if QEMU_VERSION_MAJOR >= 3
static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{ {
bsz->phys = 4096; bsz->phys = 4096;
bsz->log = 4096; bsz->log = 4096;
return 0; return 0;
} }
#endif
static int coroutine_fn vitastor_co_create_opts( static int coroutine_fn vitastor_co_create_opts(
#if QEMU_VERSION_MAJOR >= 4 #if QEMU_VERSION_MAJOR >= 4
@@ -208,6 +227,7 @@ out:
return ret; return ret;
} }
#if QEMU_VERSION_MAJOR >= 3
static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset, static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
#if QEMU_VERSION_MAJOR >= 4 #if QEMU_VERSION_MAJOR >= 4
bool exact, bool exact,
@@ -231,6 +251,7 @@ static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offse
return 0; return 0;
} }
#endif
static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{ {
@@ -244,11 +265,22 @@ static int64_t vitastor_getlength(BlockDriverState *bs)
return client->size; return client->size;
} }
#if QEMU_VERSION_MAJOR >= 3
static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp) static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
#else
static int vitastor_refresh_limits(BlockDriverState *bs)
#endif
{ {
#if QEMU_VERSION_MAJOR >= 4
bs->bl.request_alignment = 4096; bs->bl.request_alignment = 4096;
bs->bl.min_mem_alignment = 4096; bs->bl.min_mem_alignment = 4096;
#else
bs->request_alignment = 4096;
#endif
bs->bl.opt_mem_alignment = 4096; bs->bl.opt_mem_alignment = 4096;
#if QEMU_VERSION_MAJOR < 3
return 0;
#endif
} }
static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs) static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
@@ -271,7 +303,12 @@ static void vitastor_co_generic_bh_cb(int retval, void *opaque)
task->complete = 1; task->complete = 1;
if (qemu_coroutine_self() != task->co) if (qemu_coroutine_self() != task->co)
{ {
#if QEMU_VERSION_MAJOR >= 3
aio_co_wake(task->co); aio_co_wake(task->co);
#else
qemu_coroutine_enter(task->co, NULL);
qemu_aio_release(task);
#endif
} }
} }
@@ -313,6 +350,18 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offse
return task.ret; return task.ret;
} }
#if QEMU_VERSION_MAJOR < 3
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
{
return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
}
static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
{
return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
}
#endif
static int coroutine_fn vitastor_co_flush(BlockDriverState *bs) static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
{ {
VitastorClient *client = bs->opaque; VitastorClient *client = bs->opaque;
@@ -331,6 +380,7 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
return task.ret; return task.ret;
} }
#if QEMU_VERSION_MAJOR >= 3
static QemuOptsList vitastor_create_opts = { static QemuOptsList vitastor_create_opts = {
.name = "vitastor-create-opts", .name = "vitastor-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head), .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
@@ -343,6 +393,16 @@ static QemuOptsList vitastor_create_opts = {
{ /* end of list */ } { /* end of list */ }
} }
}; };
#else
static QEMUOptionParameter vitastor_create_opts[] = {
{
.name = BLOCK_OPT_SIZE,
.type = OPT_SIZE,
.help = "Virtual disk size"
},
{ NULL }
};
#endif
static const char *vitastor_strong_runtime_opts[] = { static const char *vitastor_strong_runtime_opts[] = {
"inode", "inode",
@@ -363,7 +423,9 @@ static BlockDriver bdrv_vitastor = {
.bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_get_info = vitastor_get_info, .bdrv_get_info = vitastor_get_info,
.bdrv_getlength = vitastor_getlength, .bdrv_getlength = vitastor_getlength,
#if QEMU_VERSION_MAJOR >= 3
.bdrv_probe_blocksizes = vitastor_probe_blocksizes, .bdrv_probe_blocksizes = vitastor_probe_blocksizes,
#endif
.bdrv_refresh_limits = vitastor_refresh_limits, .bdrv_refresh_limits = vitastor_refresh_limits,
// FIXME: Implement it along with per-inode statistics // FIXME: Implement it along with per-inode statistics
@@ -373,12 +435,17 @@ static BlockDriver bdrv_vitastor = {
.bdrv_close = vitastor_close, .bdrv_close = vitastor_close,
// Option list for the create operation // Option list for the create operation
#if QEMU_VERSION_MAJOR >= 3
.create_opts = &vitastor_create_opts, .create_opts = &vitastor_create_opts,
#else
.create_options = vitastor_create_opts,
#endif
// For qmp_blockdev_create(), used by the qemu monitor / QAPI // For qmp_blockdev_create(), used by the qemu monitor / QAPI
// Requires patching QAPI IDL, thus unimplemented // Requires patching QAPI IDL, thus unimplemented
//.bdrv_co_create = vitastor_co_create, //.bdrv_co_create = vitastor_co_create,
#if QEMU_VERSION_MAJOR >= 3
// For bdrv_create(), used by qemu-img // For bdrv_create(), used by qemu-img
.bdrv_co_create_opts = vitastor_co_create_opts, .bdrv_co_create_opts = vitastor_co_create_opts,
@@ -386,6 +453,11 @@ static BlockDriver bdrv_vitastor = {
.bdrv_co_preadv = vitastor_co_preadv, .bdrv_co_preadv = vitastor_co_preadv,
.bdrv_co_pwritev = vitastor_co_pwritev, .bdrv_co_pwritev = vitastor_co_pwritev,
#else
.bdrv_co_readv = vitastor_co_readv,
.bdrv_co_writev = vitastor_co_writev,
#endif
.bdrv_co_flush_to_disk = vitastor_co_flush, .bdrv_co_flush_to_disk = vitastor_co_flush,
#if QEMU_VERSION_MAJOR >= 4 #if QEMU_VERSION_MAJOR >= 4

View File

@@ -170,7 +170,7 @@ public:
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
op->req = { op->req = (osd_any_op_t){
.sec_list = { .sec_list = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@@ -233,7 +233,7 @@ public:
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
op->req = { op->req = (osd_any_op_t){
.rw = { .rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,

51
rpm/build-tarball.sh Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Vitastor depends on QEMU and FIO headers, but QEMU and FIO don't have -devel packages
# So we have to copy their headers into the source tarball
set -e
VITASTOR=$(dirname $0)
VITASTOR=$(realpath "$VITASTOR/..")
if [ -d /opt/rh/gcc-toolset-9 ]; then
# CentOS 8
EL=8
. /opt/rh/gcc-toolset-9/enable
else
# CentOS 7
EL=7
. /opt/rh/devtoolset-9/enable
fi
cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec
perl -i -pe 's/^make V=1/exit 0; make V=1/' qemu*.spec
rpmbuild -bc qemu*.spec
perl -i -pe 's/^exit 0; make V=1/make V=1/' qemu*.spec
cd ~/rpmbuild/BUILD/qemu*/
rm -rf $VITASTOR/qemu $VITASTOR/fio
mkdir -p $VITASTOR/qemu/b/qemu
make -j8 config-host.h
cp config-host.h $VITASTOR/qemu/b/qemu
cp -r include $VITASTOR/qemu
if [ -f qapi-schema.json ]; then
# QEMU 2.0
make qapi-types.h
cp qapi-types.h $VITASTOR/qemu/b/qemu
else
# QEMU 3.0+
make qapi
cp -r qapi $VITASTOR/qemu/b/qemu
fi
cd $VITASTOR
sh copy-qemu-includes.sh
rm -rf qemu
mv qemu-copy qemu
ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh
rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5$(rpm --eval '%dist').tar.gz *

20
rpm/qemu-kvm.spec.patch Normal file
View File

@@ -0,0 +1,20 @@
--- qemu-kvm.spec 2020-11-07 22:48:46.312124920 +0000
+++ qemu-kvm.spec 2020-11-07 23:04:06.246772766 +0000
@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 4.2.0
-Release: 29%{?dist}.6
+Release: 29.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -825,6 +825,8 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+# Vitastor
+Patch335: qemu-4.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build

View File

@@ -0,0 +1,46 @@
# Build packages for CentOS 7 inside a container
# cd ..; podman build -t vitastor-el7 -v `pwd`/build:/root/build -f rpm/vitastor-el7.Dockerfile .
# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
FROM centos:7
WORKDIR /root
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12
RUN yumdownloader --disablerepo=centos-sclo-rh --source qemu-kvm
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing qemu-kvm.spec
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing fio.spec
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/devtoolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/build/liburing-el7; \
rm -rf /root/build/liburing-el7/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el7/; \
cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el7/
RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.5.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/build/vitastor-el7; \
rm -rf /root/build/vitastor-el7/*; \
cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el7/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el7/

59
rpm/vitastor-el7.spec Normal file
View File

@@ -0,0 +1,59 @@
Name: vitastor
Version: 0.5
Release: 2%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.0
URL: https://vitastor.io/
Source0: vitastor-0.5.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12
BuildRequires: rh-nodejs12-npm
Requires: fio = 3.7-1.el7
Requires: qemu-kvm = 2.0.0-1.el7.6
Requires: rh-nodejs12
Requires: rh-nodejs12-npm
Requires: liburing >= 0.6
%description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives of any
size with configurable redundancy (replication or erasure codes/XOR).
%prep
%setup -q
%build
. /opt/rh/devtoolset-9/enable
make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
%install
rm -rf $RPM_BUILD_ROOT
%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
. /opt/rh/rh-nodejs12/enable
cd mon
npm install
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor/mon
%files
%doc
%_bindir/vitastor-dump-journal
%_bindir/vitastor-nbd
%_bindir/vitastor-osd
%_bindir/vitastor-rm
%_libdir/qemu-kvm/block-vitastor.so
%_libdir/vitastor
/usr/lib/vitastor
%changelog

View File

@@ -0,0 +1,65 @@
# Build packages for CentOS 8 inside a container
# cd ..; podman build -t vitastor-el8 -v `pwd`/build:/root/build -f rpm/vitastor-el8.Dockerfile .
FROM centos:8
WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build
RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='centos-advanced-virtualization-source' --source qemu-kvm
RUN dnf download --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo='*' --spec qemu-kvm.spec
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo='*' --spec fio.spec
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/gcc-toolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/build/liburing-el8; \
rm -rf /root/build/liburing-el8/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el8/; \
cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el8/
RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD qemu-*-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/build/qemu-el8; \
rm -rf /root/build/qemu-el8/*; \
rpm --nomd5 -i /root/qemu*.src.rpm; \
cd ~/rpmbuild/SPECS; \
PN=$(grep ^Patch qemu-kvm.spec | tail -n1 | perl -pe 's/Patch(\d+).*/$1/'); \
csplit qemu-kvm.spec "/^Patch$PN/"; \
cat xx00 > qemu-kvm.spec; \
head -n 1 xx01 >> qemu-kvm.spec; \
echo "Patch$((PN+1)): qemu-4.2-vitastor.patch" >> qemu-kvm.spec; \
tail -n +2 xx01 >> qemu-kvm.spec; \
perl -i -pe 's/(^Release:\s*\d+)/$1.vitastor/' qemu-kvm.spec; \
cp /root/vitastor/qemu-4.2-vitastor.patch ~/rpmbuild/SOURCES; \
rpmbuild --nocheck -ba qemu-kvm.spec; \
cp ~/rpmbuild/RPMS/*/*qemu* /root/build/qemu-el8/; \
cp ~/rpmbuild/SRPMS/*qemu* /root/build/qemu-el8/
RUN cd /root/build/qemu-el8; dnf -y install `ls qemu*.rpm | grep -vP 'debug|guest|tests|src'`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.5.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/build/vitastor-el8; \
rm -rf /root/build/vitastor-el8/*; \
cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el8/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el8/

56
rpm/vitastor-el8.spec Normal file
View File

@@ -0,0 +1,56 @@
Name: vitastor
Version: 0.5
Release: 2%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.0
URL: https://vitastor.io/
Source0: vitastor-0.5.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10
Requires: fio = 3.7-3.el8
Requires: qemu-kvm = 4.2.0-29.el8.6
Requires: nodejs >= 10
Requires: liburing >= 0.6
%description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives of any
size with configurable redundancy (replication or erasure codes/XOR).
%prep
%setup -q
%build
. /opt/rh/gcc-toolset-9/enable
make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
%install
rm -rf $RPM_BUILD_ROOT
%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
cd mon
npm install
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
%files
%doc
%_bindir/vitastor-dump-journal
%_bindir/vitastor-nbd
%_bindir/vitastor-osd
%_bindir/vitastor-rm
%_libdir/qemu-kvm/block-vitastor.so
%_libdir/vitastor
/usr/lib/vitastor
%changelog

34
test-build-el7.sh Normal file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Cheatsheet for CentOS 7 packaging (not a build script)
set -e
rm -f /etc/yum.repos.d/CentOS-Media.repo
yum -y --enablerepo=extras install centos-release-scl epel-release
yum -y --enablerepo='*' install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel
yumdownloader --source qemu
yumdownloader --source fio
yum-builddep -y --enablerepo='*' qemu
yum -y install rpm-build
. /opt/rh/devtoolset-9/enable
rpm --nomd5 -i qemu*.src.rpm
rpm --nomd5 -i fio*.src.rpm
cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec
perl -i -pe 's/^make V=1/exit 1; make V=1/' qemu.spec
rpmbuild -bc qemu.spec
perl -i -pe 's/^exit 1; make V=1/make V=1/' qemu.spec
cd ~/rpmbuild/BUILD/qemu*/
make qapi-types.h
mkdir -p ~/vitastor/qemu/b/qemu
cp config-host.h ~/vitastor/qemu/b/qemu
cp qapi-types.h ~/vitastor/qemu/b/qemu
cp -r include ~/vitastor/qemu
cd ~/vitastor
sh copy-qemu-includes.sh
mv qemu qemu-old
mv qemu-copy qemu
ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh
rm fio
mv fio-copy fio

View File

@@ -12,4 +12,4 @@
#define PATTERN3 0x426bd7854eb08509 #define PATTERN3 0x426bd7854eb08509
#define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; } #define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); } #define check_pattern(buf, len, pattern) { uint64_t bad = UINT64_MAX; for (uint64_t i = 0; i < len; i += 8) { if ((*(uint64_t*)(buf + i)) != (pattern)) { bad = i; break; } } if (bad != UINT64_MAX) { printf("mismatch at %lx\n", bad); } assert(bad == UINT64_MAX); }

View File

@@ -30,7 +30,7 @@
#include "blockstore.h" #include "blockstore.h"
#include "blockstore_impl.h" #include "blockstore_impl.h"
#include "osd_peering_pg.h" #include "osd_peering_pg.cpp"
//#include "cpp-btree/btree_map.h" //#include "cpp-btree/btree_map.h"
static int setup_context(unsigned entries, struct io_uring *ring) static int setup_context(unsigned entries, struct io_uring *ring)
@@ -168,7 +168,7 @@ int main0(int argc, char *argv[])
}, },
.version = 1, .version = 1,
}] = (dirty_entry){ }] = (dirty_entry){
.state = ST_D_SYNCED, .state = BS_ST_SYNCED | BS_ST_BIG_WRITE,
.flags = 0, .flags = 0,
.location = (uint64_t)i << 17, .location = (uint64_t)i << 17,
.offset = 0, .offset = 0,