From d658f65c16901afd58a7ce88dcebfdefb3594924 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 6 Aug 2020 15:40:29 +0800 Subject: [PATCH 1/8] migration: unify the framework of socket-type channel Currently, the only difference of tcp channel and unix channel in migration/socket.c is the way to build SocketAddress, but socket_parse() can handle these two types, so use it to instead of tcp_build_address() and unix_build_address(). The socket-type channel can be further unified based on the up, this would be helpful for us to add other socket-type channels. Signed-off-by: Longpeng(Mike) Message-Id: <20200806074030.174-2-longpeng2@huawei.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 18 +++++------ migration/socket.c | 72 +++++++++---------------------------------- migration/socket.h | 11 ++----- 3 files changed, 26 insertions(+), 75 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index dbd4afa1e8..ac37a386dd 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -378,21 +378,20 @@ void migrate_add_address(SocketAddress *address) void qemu_start_incoming_migration(const char *uri, Error **errp) { - const char *p; + const char *p = NULL; qapi_event_send_migration(MIGRATION_STATUS_SETUP); if (!strcmp(uri, "defer")) { deferred_incoming_migration(errp); - } else if (strstart(uri, "tcp:", &p)) { - tcp_start_incoming_migration(p, errp); + } else if (strstart(uri, "tcp:", &p) || + strstart(uri, "unix:", NULL)) { + socket_start_incoming_migration(p ? p : uri, errp); #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { rdma_start_incoming_migration(p, errp); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_incoming_migration(p, errp); - } else if (strstart(uri, "unix:", &p)) { - unix_start_incoming_migration(p, errp); } else if (strstart(uri, "fd:", &p)) { fd_start_incoming_migration(p, errp); } else { @@ -2094,7 +2093,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, { Error *local_err = NULL; MigrationState *s = migrate_get_current(); - const char *p; + const char *p = NULL; if (!migrate_prepare(s, has_blk && blk, has_inc && inc, has_resume && resume, errp)) { @@ -2102,16 +2101,15 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, return; } - if (strstart(uri, "tcp:", &p)) { - tcp_start_outgoing_migration(s, p, &local_err); + if (strstart(uri, "tcp:", &p) || + strstart(uri, "unix:", NULL)) { + socket_start_outgoing_migration(s, p ? p : uri, &local_err); #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { rdma_start_outgoing_migration(s, p, &local_err); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_outgoing_migration(s, p, &local_err); - } else if (strstart(uri, "unix:", &p)) { - unix_start_outgoing_migration(s, p, &local_err); } else if (strstart(uri, "fd:", &p)) { fd_start_outgoing_migration(s, p, &local_err); } else { diff --git a/migration/socket.c b/migration/socket.c index 97c9efde59..6016642e04 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -50,34 +50,6 @@ int socket_send_channel_destroy(QIOChannel *send) return 0; } -static SocketAddress *tcp_build_address(const char *host_port, Error **errp) -{ - SocketAddress *saddr; - - saddr = g_new0(SocketAddress, 1); - saddr->type = SOCKET_ADDRESS_TYPE_INET; - - if (inet_parse(&saddr->u.inet, host_port, errp)) { - qapi_free_SocketAddress(saddr); - return NULL; - } - - return saddr; -} - - -static SocketAddress *unix_build_address(const char *path) -{ - SocketAddress *saddr; - - saddr = g_new0(SocketAddress, 1); - saddr->type = SOCKET_ADDRESS_TYPE_UNIX; - saddr->u.q_unix.path = g_strdup(path); - - return saddr; -} - - struct SocketConnectData { MigrationState *s; char *hostname; @@ -109,9 +81,10 @@ static void socket_outgoing_migration(QIOTask *task, object_unref(OBJECT(sioc)); } -static void socket_start_outgoing_migration(MigrationState *s, - SocketAddress *saddr, - Error **errp) +static void +socket_start_outgoing_migration_internal(MigrationState *s, + SocketAddress *saddr, + Error **errp) { QIOChannelSocket *sioc = qio_channel_socket_new(); struct SocketConnectData *data = g_new0(struct SocketConnectData, 1); @@ -135,27 +108,18 @@ static void socket_start_outgoing_migration(MigrationState *s, NULL); } -void tcp_start_outgoing_migration(MigrationState *s, - const char *host_port, - Error **errp) +void socket_start_outgoing_migration(MigrationState *s, + const char *str, + Error **errp) { Error *err = NULL; - SocketAddress *saddr = tcp_build_address(host_port, &err); + SocketAddress *saddr = socket_parse(str, &err); if (!err) { - socket_start_outgoing_migration(s, saddr, &err); + socket_start_outgoing_migration_internal(s, saddr, &err); } error_propagate(errp, err); } -void unix_start_outgoing_migration(MigrationState *s, - const char *path, - Error **errp) -{ - SocketAddress *saddr = unix_build_address(path); - socket_start_outgoing_migration(s, saddr, errp); -} - - static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) @@ -173,8 +137,9 @@ static void socket_accept_incoming_migration(QIONetListener *listener, } -static void socket_start_incoming_migration(SocketAddress *saddr, - Error **errp) +static void +socket_start_incoming_migration_internal(SocketAddress *saddr, + Error **errp) { QIONetListener *listener = qio_net_listener_new(); size_t i; @@ -207,20 +172,13 @@ static void socket_start_incoming_migration(SocketAddress *saddr, } } -void tcp_start_incoming_migration(const char *host_port, Error **errp) +void socket_start_incoming_migration(const char *str, Error **errp) { Error *err = NULL; - SocketAddress *saddr = tcp_build_address(host_port, &err); + SocketAddress *saddr = socket_parse(str, &err); if (!err) { - socket_start_incoming_migration(saddr, &err); + socket_start_incoming_migration_internal(saddr, &err); } qapi_free_SocketAddress(saddr); error_propagate(errp, err); } - -void unix_start_incoming_migration(const char *path, Error **errp) -{ - SocketAddress *saddr = unix_build_address(path); - socket_start_incoming_migration(saddr, errp); - qapi_free_SocketAddress(saddr); -} diff --git a/migration/socket.h b/migration/socket.h index 528c3b0202..891dbccceb 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,13 +23,8 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); int socket_send_channel_destroy(QIOChannel *send); -void tcp_start_incoming_migration(const char *host_port, Error **errp); +void socket_start_incoming_migration(const char *str, Error **errp); -void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, - Error **errp); - -void unix_start_incoming_migration(const char *path, Error **errp); - -void unix_start_outgoing_migration(MigrationState *s, const char *path, - Error **errp); +void socket_start_outgoing_migration(MigrationState *s, const char *str, + Error **errp); #endif From 9ba3b2baa1f06e772c84f0ab77146428a2b19db1 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 6 Aug 2020 15:40:30 +0800 Subject: [PATCH 2/8] migration: add vsock as data channel support The vsock channel is more widely use in some new features, for example, the Nitro/Enclave. It can also be used as the migration channel. Signed-off-by: Longpeng(Mike) Message-Id: <20200806074030.174-3-longpeng2@huawei.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index ac37a386dd..58a5452471 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -384,7 +384,8 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) if (!strcmp(uri, "defer")) { deferred_incoming_migration(errp); } else if (strstart(uri, "tcp:", &p) || - strstart(uri, "unix:", NULL)) { + strstart(uri, "unix:", NULL) || + strstart(uri, "vsock:", NULL)) { socket_start_incoming_migration(p ? p : uri, errp); #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { @@ -2102,7 +2103,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } if (strstart(uri, "tcp:", &p) || - strstart(uri, "unix:", NULL)) { + strstart(uri, "unix:", NULL) || + strstart(uri, "vsock:", NULL)) { socket_start_outgoing_migration(s, p ? p : uri, &local_err); #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { From aa8a926d3c0af81a31f50f8d1d9688c6f2d67aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= Date: Thu, 27 Aug 2020 12:16:00 +0100 Subject: [PATCH 3/8] migration: improve error reporting of block driver state name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With blockdev, a BlockDriverState may not have a device name, so using a node name is required as an alternative. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Daniel P. Berrangé Message-Id: <20200827111606.1408275-2-berrange@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- migration/savevm.c | 12 ++++++------ tests/qemu-iotests/267.out | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index a843d202b5..304d98ff78 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2682,7 +2682,7 @@ int save_snapshot(const char *name, Error **errp) if (!bdrv_all_can_snapshot(&bs)) { error_setg(errp, "Device '%s' is writable but does not support " - "snapshots", bdrv_get_device_name(bs)); + "snapshots", bdrv_get_device_or_node_name(bs)); return ret; } @@ -2691,7 +2691,7 @@ int save_snapshot(const char *name, Error **errp) ret = bdrv_all_delete_snapshot(name, &bs1, errp); if (ret < 0) { error_prepend(errp, "Error while deleting snapshot on device " - "'%s': ", bdrv_get_device_name(bs1)); + "'%s': ", bdrv_get_device_or_node_name(bs1)); return ret; } } @@ -2766,7 +2766,7 @@ int save_snapshot(const char *name, Error **errp) ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); if (ret < 0) { error_setg(errp, "Error while creating snapshot on '%s'", - bdrv_get_device_name(bs)); + bdrv_get_device_or_node_name(bs)); goto the_end; } @@ -2884,14 +2884,14 @@ int load_snapshot(const char *name, Error **errp) if (!bdrv_all_can_snapshot(&bs)) { error_setg(errp, "Device '%s' is writable but does not support snapshots", - bdrv_get_device_name(bs)); + bdrv_get_device_or_node_name(bs)); return -ENOTSUP; } ret = bdrv_all_find_snapshot(name, &bs); if (ret < 0) { error_setg(errp, "Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_name(bs), name); + bdrv_get_device_or_node_name(bs), name); return ret; } @@ -2920,7 +2920,7 @@ int load_snapshot(const char *name, Error **errp) ret = bdrv_all_goto_snapshot(name, &bs, errp); if (ret < 0) { error_prepend(errp, "Could not load snapshot '%s' on '%s': ", - name, bdrv_get_device_name(bs)); + name, bdrv_get_device_or_node_name(bs)); goto err_drain; } diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out index d6d80c099f..215902b3ad 100644 --- a/tests/qemu-iotests/267.out +++ b/tests/qemu-iotests/267.out @@ -81,11 +81,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 Testing: -blockdev driver=file,filename=TEST_DIR/t.IMGFMT,node-name=file QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 -Error: Device '' is writable but does not support snapshots +Error: Device 'file' is writable but does not support snapshots (qemu) info snapshots No available block device supports snapshots (qemu) loadvm snap0 -Error: Device '' is writable but does not support snapshots +Error: Device 'file' is writable but does not support snapshots (qemu) quit Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 From a9e80a5f0cdd8e51c266ea0f943f8aafdd0afd13 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 22 Jul 2020 11:32:28 +0800 Subject: [PATCH 4/8] migration: tls: fix memory leak in migration_tls_get_creds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently migration_tls_get_creds() adds the reference of creds but there was no place to unref it. So the OBJECT(creds) will never be freed and result in memory leak. The leak stack: Direct leak of 104 byte(s) in 1 object(s) allocated from: #0 0xffffa88bd20b in __interceptor_malloc (/usr/lib64/libasan.so.4+0xd320b) #1 0xffffa7f0cb1b in g_malloc (/usr/lib64/libglib-2.0.so.0+0x58b1b) #2 0x14b58cb in object_new_with_type qom/object.c:634 #3 0x14b597b in object_new qom/object.c:645 #4 0x14c0e4f in user_creatable_add_type qom/object_interfaces.c:59 #5 0x141c78b in qmp_object_add qom/qom-qmp-cmds.c:312 #6 0x140e513 in qmp_marshal_object_add qapi/qapi-commands-qom.c:279 #7 0x176ba97 in do_qmp_dispatch qapi/qmp-dispatch.c:165 #8 0x176bee7 in qmp_dispatch qapi/qmp-dispatch.c:208 #9 0x136e337 in monitor_qmp_dispatch monitor/qmp.c:150 #10 0x136eae3 in monitor_qmp_bh_dispatcher monitor/qmp.c:239 #11 0x1852e93 in aio_bh_call util/async.c:89 #12 0x18531b7 in aio_bh_poll util/async.c:117 #13 0x18616bf in aio_dispatch util/aio-posix.c:459 #14 0x1853f37 in aio_ctx_dispatch util/async.c:268 #15 0xffffa7f06a7b in g_main_context_dispatch (/usr/lib64/libglib-2.0.so.0+0x52a7b) Since we're fine to use the borrowed reference when using the creds, so just remove the object_ref() in migration_tls_get_creds(). Signed-off-by: Zhenyu Ye Message-Id: <20200722033228.71-1-yezhenyu2@huawei.com> Reviewed-by: Daniel P. Berrangé Signed-off-by: Dr. David Alan Gilbert --- migration/tls.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/tls.c b/migration/tls.c index 5171afc6c4..7a02ec8656 100644 --- a/migration/tls.c +++ b/migration/tls.c @@ -58,7 +58,6 @@ migration_tls_get_creds(MigrationState *s, return NULL; } - object_ref(OBJECT(ret)); return ret; } From 88fc107956a5812649e5918e0c092d3f78bb28ad Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 27 Jul 2020 12:18:41 -0400 Subject: [PATCH 5/8] virtiofsd: Disable remote posix locks by default Right now we enable remote posix locks by default. That means when guest does a posix lock it sends request to server (virtiofsd). But currently we only support non-blocking posix lock and return -EOPNOTSUPP for blocking version. This means that existing applications which are doing blocking posix locks get -EOPNOTSUPP and fail. To avoid this, people have been running virtiosd with option "-o no_posix_lock". For new users it is still a surprise and trial and error takes them to this option. Given posix lock implementation is not complete in virtiofsd, disable it by default. This means that posix locks will work with-in applications in a guest but not across guests. Anyway we don't support sharing filesystem among different guests yet in virtiofs so this should not lead to any kind of surprise or regression and will make life little easier for virtiofs users. Reported-by: Aa Aa Suggested-by: Miklos Szeredi Signed-off-by: Vivek Goyal Reviewed-by: Stefan Hajnoczi Reviewed-by: Misono Tomohiro Signed-off-by: Dr. David Alan Gilbert --- docs/tools/virtiofsd.rst | 2 +- tools/virtiofsd/passthrough_ll.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst index 824e713491..2cfdfd9ba2 100644 --- a/docs/tools/virtiofsd.rst +++ b/docs/tools/virtiofsd.rst @@ -67,7 +67,7 @@ Options Disable racy fallback. The default is false. * posix_lock|no_posix_lock - - Enable/disable remote POSIX locks. The default is ``posix_lock``. + Enable/disable remote POSIX locks. The default is ``no_posix_lock``. * readdirplus|no_readdirplus - Enable/disable readdirplus. The default is ``readdirplus``. diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 63d1d00565..a9feb90fd0 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2823,7 +2823,7 @@ int main(int argc, char *argv[]) struct lo_data lo = { .debug = 0, .writeback = 0, - .posix_lock = 1, + .posix_lock = 0, .proc_self_fd = -1, }; struct lo_map_elem *root_elem; From e9a78564a12c5cb670c4cbd5398a1ea8cd9ae642 Mon Sep 17 00:00:00 2001 From: Sergio Lopez Date: Fri, 17 Jul 2020 14:11:10 +0200 Subject: [PATCH 6/8] virtiofsd: Remove "norace" from cmdline help and docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 93bb3d8d4cda ("virtiofsd: remove symlink fallbacks") removed the implementation of the "norace" option, so remove it from the cmdline help and the documentation too. Signed-off-by: Sergio Lopez Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefano Garzarella Message-Id: <20200717121110.50580-1-slp@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- docs/tools/virtiofsd.rst | 3 --- tools/virtiofsd/helper.c | 2 -- 2 files changed, 5 deletions(-) diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst index 2cfdfd9ba2..e33c81ed41 100644 --- a/docs/tools/virtiofsd.rst +++ b/docs/tools/virtiofsd.rst @@ -63,9 +63,6 @@ Options Print only log messages matching LEVEL or more severe. LEVEL is one of ``err``, ``warn``, ``info``, or ``debug``. The default is ``info``. - * norace - - Disable racy fallback. The default is false. - * posix_lock|no_posix_lock - Enable/disable remote POSIX locks. The default is ``no_posix_lock``. diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 3105b6c23a..7bc5d7dc5a 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -159,8 +159,6 @@ void fuse_cmdline_help(void) " -o max_idle_threads the maximum number of idle worker " "threads\n" " allowed (default: 10)\n" - " -o norace disable racy fallback\n" - " default: false\n" " -o posix_lock|no_posix_lock\n" " enable/disable remote posix lock\n" " default: posix_lock\n" From 1c7cb1f52e2577e190c09c9a14e6b6f56f4a3ec3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 27 Jul 2020 20:02:21 +0100 Subject: [PATCH 7/8] virtiofsd: drop CAP_DAC_READ_SEARCH virtiofsd does not need CAP_DAC_READ_SEARCH because it already has the more powerful CAP_DAC_OVERRIDE. Drop it from the list of capabilities. This is important because container runtimes may not include CAP_DAC_READ_SEARCH by default. This patch allows virtiofsd to reduce its capabilities when running inside a Docker container. Note that CAP_DAC_READ_SEARCH may be necessary again in the future if virtiofsd starts using open_by_handle_at(2). Signed-off-by: Stefan Hajnoczi Reviewed-by: Dr. David Alan Gilbert Message-Id: <20200727190223.422280-2-stefanha@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index a9feb90fd0..784330e0e4 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2596,7 +2596,6 @@ static void setup_capabilities(char *modcaps_in) if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, CAP_CHOWN, CAP_DAC_OVERRIDE, - CAP_DAC_READ_SEARCH, CAP_FOWNER, CAP_FSETID, CAP_SETGID, From fd9279ec9985d9c8a0b533eff24839f93695b0f4 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 27 Jul 2020 20:02:23 +0100 Subject: [PATCH 8/8] virtiofsd: probe unshare(CLONE_FS) and print an error An assertion failure is raised during request processing if unshare(CLONE_FS) fails. Implement a probe at startup so the problem can be detected right away. Unfortunately Docker/Moby does not include unshare in the seccomp.json list unless CAP_SYS_ADMIN is given. Other seccomp.json lists always include unshare (e.g. podman is unaffected): https://raw.githubusercontent.com/seccomp/containers-golang/master/seccomp.json Use "docker run --security-opt seccomp=path/to/seccomp.json ..." if the default seccomp.json is missing unshare. Cc: Misono Tomohiro Signed-off-by: Stefan Hajnoczi Message-Id: <20200727190223.422280-4-stefanha@redhat.com> Reviewed-by: Misono Tomohiro Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/fuse_virtio.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index 3b6d16a041..9e5537506c 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -949,6 +949,22 @@ int virtio_session_mount(struct fuse_session *se) { int ret; + /* + * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's + * an unprivileged system call but some Docker/Moby versions are known to + * reject it via seccomp when CAP_SYS_ADMIN is not given. + * + * Note that the program is single-threaded here so this syscall has no + * visible effect and is safe to make. + */ + ret = unshare(CLONE_FS); + if (ret == -1 && errno == EPERM) { + fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If " + "running in a container please check that the container " + "runtime seccomp policy allows unshare.\n"); + return -1; + } + ret = fv_create_listen_socket(se); if (ret < 0) { return ret;