From ea8f942fe46dd10e0946f02ab9d698fb41e958f7 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:35 +0200 Subject: [PATCH 01/29] blockdev: Make eject fail for non-removable drives even with -f Ejecting hard disk platters can only end in tears. If you need to revoke access to an image, use drive_del, not eject -f. Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- blockdev.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/blockdev.c b/blockdev.c index 0b8d3a4f83..a25367a9e3 100644 --- a/blockdev.c +++ b/blockdev.c @@ -646,16 +646,13 @@ out: static int eject_device(Monitor *mon, BlockDriverState *bs, int force) { - if (!force) { - if (!bdrv_is_removable(bs)) { - qerror_report(QERR_DEVICE_NOT_REMOVABLE, - bdrv_get_device_name(bs)); - return -1; - } - if (bdrv_is_locked(bs)) { - qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs)); - return -1; - } + if (!bdrv_is_removable(bs)) { + qerror_report(QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs)); + return -1; + } + if (!force && bdrv_is_locked(bs)) { + qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs)); + return -1; } bdrv_close(bs); return 0; From a19712b0dbe43016fb17ec48bfff2f360225fe97 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:36 +0200 Subject: [PATCH 02/29] block: Reset device model callbacks on detach BlockDriverState members change_cb and change_opaque are initially null. The device model may set them, with bdrv_set_change_cb(). If the device model gets detached (hot unplug), they're left dangling. Only safe because device hot unplug automatically destroys the BlockDriverState. But that's a questionable feature, best not to rely on it. 
Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block.c b/block.c index 9549b9eff9..81a82578a2 100644 --- a/block.c +++ b/block.c @@ -730,6 +730,8 @@ void bdrv_detach(BlockDriverState *bs, DeviceState *qdev) { assert(bs->peer == qdev); bs->peer = NULL; + bs->change_cb = NULL; + bs->change_opaque = NULL; } DeviceState *bdrv_get_attached(BlockDriverState *bs) From 02266d547a6c7b10e1ac1574ec69b92f4e28f817 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:40 +0200 Subject: [PATCH 03/29] block/raw-win32: Drop disabled code for removable host devices It's been disabled since the start (commit 19cb3738, Aug 2006), and has been untouched except for spelling fixes and such. I don't feel like dragging it along any further. Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block/raw-win32.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/block/raw-win32.c b/block/raw-win32.c index 91067e7595..e47cfe0f4a 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -393,41 +393,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) return 0; } -#if 0 -/***********************************************/ -/* removable device additional commands */ - -static int raw_is_inserted(BlockDriverState *bs) -{ - return 1; -} - -static int raw_media_changed(BlockDriverState *bs) -{ - return -ENOTSUP; -} - -static int raw_eject(BlockDriverState *bs, int eject_flag) -{ - DWORD ret_count; - - if (s->type == FTYPE_FILE) - return -ENOTSUP; - if (eject_flag) { - DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA, - NULL, 0, NULL, 0, &lpBytesReturned, NULL); - } else { - DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA, - NULL, 0, NULL, 0, &lpBytesReturned, NULL); - } -} - -static int raw_set_locked(BlockDriverState *bs, int locked) -{ - return -ENOTSUP; -} -#endif - static int hdev_has_zero_init(BlockDriverState *bs) { 
return 0; From 7bf37feddcfa527304cfdc02bd2db8912ee9bf8c Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:41 +0200 Subject: [PATCH 04/29] block: Make BlockDriver method bdrv_set_locked() return void The only caller is bdrv_set_locked(), and it ignores the value. Callees always return 0, except for FreeBSD's cdrom_set_locked(), which returns -ENOTSUP when the device is in a terminally wedged state. Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block/raw-posix.c | 10 +++------- block/raw.c | 3 +-- block_int.h | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index cd89c8312a..5241308d23 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1363,7 +1363,7 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag) return 0; } -static int cdrom_set_locked(BlockDriverState *bs, int locked) +static void cdrom_set_locked(BlockDriverState *bs, int locked) { BDRVRawState *s = bs->opaque; @@ -1374,8 +1374,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked) */ /* perror("CDROM_LOCKDOOR"); */ } - - return 0; } static BlockDriver bdrv_host_cdrom = { @@ -1486,12 +1484,12 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag) return 0; } -static int cdrom_set_locked(BlockDriverState *bs, int locked) +static void cdrom_set_locked(BlockDriverState *bs, int locked) { BDRVRawState *s = bs->opaque; if (s->fd < 0) - return -ENOTSUP; + return; if (ioctl(s->fd, (locked ? 
CDIOCPREVENT : CDIOCALLOW)) < 0) { /* * Note: an error can happen if the distribution automatically @@ -1499,8 +1497,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked) */ /* perror("CDROM_LOCKDOOR"); */ } - - return 0; } static BlockDriver bdrv_host_cdrom = { diff --git a/block/raw.c b/block/raw.c index b0f72d6a62..1398a9c221 100644 --- a/block/raw.c +++ b/block/raw.c @@ -80,10 +80,9 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) return bdrv_eject(bs->file, eject_flag); } -static int raw_set_locked(BlockDriverState *bs, int locked) +static void raw_set_locked(BlockDriverState *bs, int locked) { bdrv_set_locked(bs->file, locked); - return 0; } static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) diff --git a/block_int.h b/block_int.h index efb68038c4..e0b638c116 100644 --- a/block_int.h +++ b/block_int.h @@ -113,7 +113,7 @@ struct BlockDriver { int (*bdrv_is_inserted)(BlockDriverState *bs); int (*bdrv_media_changed)(BlockDriverState *bs); int (*bdrv_eject)(BlockDriverState *bs, int eject_flag); - int (*bdrv_set_locked)(BlockDriverState *bs, int locked); + void (*bdrv_set_locked)(BlockDriverState *bs, int locked); /* to control generic scsi devices */ int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf); From 822e1cd17e8fa3ae98d0481c20f042316ace3fbc Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:42 +0200 Subject: [PATCH 05/29] block: Make BlockDriver method bdrv_eject() return void Callees always return 0, except for FreeBSD's cdrom_eject(), which returns -ENOTSUP when the device is in a terminally wedged state. The only caller is bdrv_eject(), and it maps -ENOTSUP to 0 since commit 4be9762a. 
Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block.c | 17 ++++------------- block/raw-posix.c | 16 +++++----------- block/raw.c | 4 ++-- block_int.h | 2 +- 4 files changed, 12 insertions(+), 27 deletions(-) diff --git a/block.c b/block.c index 81a82578a2..7c25fe4990 100644 --- a/block.c +++ b/block.c @@ -2770,25 +2770,16 @@ int bdrv_media_changed(BlockDriverState *bs) int bdrv_eject(BlockDriverState *bs, int eject_flag) { BlockDriver *drv = bs->drv; - int ret; if (bs->locked) { return -EBUSY; } - if (!drv || !drv->bdrv_eject) { - ret = -ENOTSUP; - } else { - ret = drv->bdrv_eject(bs, eject_flag); + if (drv && drv->bdrv_eject) { + drv->bdrv_eject(bs, eject_flag); } - if (ret == -ENOTSUP) { - ret = 0; - } - if (ret >= 0) { - bs->tray_open = eject_flag; - } - - return ret; + bs->tray_open = eject_flag; + return 0; } int bdrv_is_locked(BlockDriverState *bs) diff --git a/block/raw-posix.c b/block/raw-posix.c index 5241308d23..6672d31da3 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1254,7 +1254,7 @@ static int floppy_media_changed(BlockDriverState *bs) return ret; } -static int floppy_eject(BlockDriverState *bs, int eject_flag) +static void floppy_eject(BlockDriverState *bs, int eject_flag) { BDRVRawState *s = bs->opaque; int fd; @@ -1269,8 +1269,6 @@ static int floppy_eject(BlockDriverState *bs, int eject_flag) perror("FDEJECT"); close(fd); } - - return 0; } static BlockDriver bdrv_host_floppy = { @@ -1348,7 +1346,7 @@ static int cdrom_is_inserted(BlockDriverState *bs) return 0; } -static int cdrom_eject(BlockDriverState *bs, int eject_flag) +static void cdrom_eject(BlockDriverState *bs, int eject_flag) { BDRVRawState *s = bs->opaque; @@ -1359,8 +1357,6 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag) if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) perror("CDROMEJECT"); } - - return 0; } static void cdrom_set_locked(BlockDriverState *bs, int locked) @@ -1462,12 +1458,12 @@ static int cdrom_is_inserted(BlockDriverState *bs) 
return raw_getlength(bs) > 0; } -static int cdrom_eject(BlockDriverState *bs, int eject_flag) +static void cdrom_eject(BlockDriverState *bs, int eject_flag) { BDRVRawState *s = bs->opaque; if (s->fd < 0) - return -ENOTSUP; + return; (void) ioctl(s->fd, CDIOCALLOW); @@ -1479,9 +1475,7 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag) perror("CDIOCCLOSE"); } - if (cdrom_reopen(bs) < 0) - return -ENOTSUP; - return 0; + cdrom_reopen(bs); } static void cdrom_set_locked(BlockDriverState *bs, int locked) diff --git a/block/raw.c b/block/raw.c index 1398a9c221..cb6203eeca 100644 --- a/block/raw.c +++ b/block/raw.c @@ -75,9 +75,9 @@ static int raw_is_inserted(BlockDriverState *bs) return bdrv_is_inserted(bs->file); } -static int raw_eject(BlockDriverState *bs, int eject_flag) +static void raw_eject(BlockDriverState *bs, int eject_flag) { - return bdrv_eject(bs->file, eject_flag); + bdrv_eject(bs->file, eject_flag); } static void raw_set_locked(BlockDriverState *bs, int locked) diff --git a/block_int.h b/block_int.h index e0b638c116..efefbee289 100644 --- a/block_int.h +++ b/block_int.h @@ -112,7 +112,7 @@ struct BlockDriver { /* removable device specific */ int (*bdrv_is_inserted)(BlockDriverState *bs); int (*bdrv_media_changed)(BlockDriverState *bs); - int (*bdrv_eject)(BlockDriverState *bs, int eject_flag); + void (*bdrv_eject)(BlockDriverState *bs, int eject_flag); void (*bdrv_set_locked)(BlockDriverState *bs, int locked); /* to control generic scsi devices */ From 49aa46bb4b894ff8bdb0339ee2a5dd3fcfe93ecd Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Wed, 20 Jul 2011 18:23:43 +0200 Subject: [PATCH 06/29] block: Don't let locked flag prevent medium load Commit aea2a33c made bdrv_eject() obey the locked flag. Correct for medium eject (eject_flag set), incorrect for medium load (eject_flag clear). See MMC-5 Table 341 "Actions for Lock/Unlock/Eject". 
Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index 7c25fe4990..8859f9b414 100644 --- a/block.c +++ b/block.c @@ -2771,7 +2771,7 @@ int bdrv_eject(BlockDriverState *bs, int eject_flag) { BlockDriver *drv = bs->drv; - if (bs->locked) { + if (eject_flag && bs->locked) { return -EBUSY; } From efc8243d00ab4cf4fa05a9be93233cb883b7caa0 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 25 Jul 2011 18:34:35 +0000 Subject: [PATCH 07/29] block/vpc.c: Detect too-large vpc file VHD files technically can be up to 2TB, but Virtual PC is limited to 127G. Currently qemu-img refuses to create vpc files > 127G, but it fails to return an error when converting from a non-vpc VHD file which is >127G. It returns success, but creates a truncated converted image. Also, qemu-img info claims the vpc file is 127G (and clean). This patch detects a too-large vpc file and returns -EFBIG. Without this patch, ============================================================= root@ip-10-38-123-242:~/qemu-fixed# qemu-img info /mnt/140g-dynamic.vhd image: /mnt/140g-dynamic.vhd file format: vpc virtual size: 127G (136899993600 bytes) disk size: 284K root@ip-10-38-123-242:~/qemu-fixed# qemu-img convert -f vpc -O raw /mnt/140g-dynamic.vhd /mnt/y root@ip-10-38-123-242:~/qemu-fixed# echo $? 0 root@ip-10-38-123-242:~/qemu-fixed# qemu-img info /mnt/y image: /mnt/y file format: raw virtual size: 127G (136899993600 bytes) disk size: 0 ============================================================= (The 140G image was truncated with no warning or error.)
With the patch, I get: ============================================================= root@ip-10-38-123-242:~/qemu-fixed# ./qemu-img info /mnt/140g-dynamic.vhd qemu-img: Could not open '/mnt/140g-dynamic.vhd': File too large root@ip-10-38-123-242:~/qemu-fixed# ./qemu-img convert -f vpc -O raw /mnt/140g-dynamic.vhd /mnt/y qemu-img: Could not open '/mnt/140g-dynamic.vhd': File too large qemu-img: Could not open '/mnt/140g-dynamic.vhd' ============================================================= See https://bugs.launchpad.net/qemu/+bug/814222 for details. Signed-off-by: Serge Hallyn Signed-off-by: Kevin Wolf --- block/vpc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/vpc.c b/block/vpc.c index 56865da5bc..fdd5236892 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -156,6 +156,7 @@ static int vpc_open(BlockDriverState *bs, int flags) struct vhd_dyndisk_header* dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; + int err = -1; if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) goto fail; @@ -176,6 +177,11 @@ static int vpc_open(BlockDriverState *bs, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + if (bs->total_sectors >= 65535 * 16 * 255) { + err = -EFBIG; + goto fail; + } + if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE) != HEADER_SIZE) goto fail; @@ -222,7 +228,7 @@ static int vpc_open(BlockDriverState *bs, int flags) return 0; fail: - return -1; + return err; } /* From 5f71d32f0da4d1e578738f765b57fbfaf4bd3214 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 Jul 2011 16:51:12 +0200 Subject: [PATCH 08/29] scsi-disk: Codingstyle fixes Replace tabs with spaces. 
Signed-off-by: Hannes Reinecke Signed-off-by: Kevin Wolf --- hw/scsi-disk.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index f42a5d1f85..715f2cdec4 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -526,7 +526,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) memset(outbuf, 0, buflen); if (req->lun) { - outbuf[0] = 0x7f; /* LUN not supported */ + outbuf[0] = 0x7f; /* LUN not supported */ return buflen; } @@ -836,7 +836,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) case TEST_UNIT_READY: if (!bdrv_is_inserted(s->bs)) goto not_ready; - break; + break; case REQUEST_SENSE: if (req->cmd.xfer < 4) goto illegal_request; @@ -848,7 +848,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) buflen = scsi_disk_emulate_inquiry(req, outbuf); if (buflen < 0) goto illegal_request; - break; + break; case MODE_SENSE: case MODE_SENSE_10: buflen = scsi_disk_emulate_mode_sense(req, outbuf); @@ -881,14 +881,14 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) /* load/eject medium */ bdrv_eject(s->bs, !(req->cmd.buf[4] & 1)); } - break; + break; case ALLOW_MEDIUM_REMOVAL: bdrv_set_locked(s->bs, req->cmd.buf[4] & 1); - break; + break; case READ_CAPACITY: /* The normal LEN field for this command is zero. 
*/ - memset(outbuf, 0, 8); - bdrv_get_geometry(s->bs, &nb_sectors); + memset(outbuf, 0, 8); + bdrv_get_geometry(s->bs, &nb_sectors); if (!nb_sectors) goto not_ready; nb_sectors /= s->cluster_size; @@ -908,7 +908,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) outbuf[6] = s->cluster_size * 2; outbuf[7] = 0; buflen = 8; - break; + break; case SYNCHRONIZE_CACHE: ret = bdrv_flush(s->bs); if (ret < 0) { From 3790372c963dbc87d4efdf24f8b718c283798fa0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 Jul 2011 16:51:13 +0200 Subject: [PATCH 09/29] scsi: Remove references to SET_WINDOW SET_WINDOW command is vendor-specific only. So we shouldn't try to emulate it. Signed-off-by: Hannes Reinecke Signed-off-by: Kevin Wolf --- hw/scsi-bus.c | 2 -- hw/scsi-defs.h | 1 - 2 files changed, 3 deletions(-) diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c index 8b1a412210..facc98d527 100644 --- a/hw/scsi-bus.c +++ b/hw/scsi-bus.c @@ -350,7 +350,6 @@ static void scsi_req_xfer_mode(SCSIRequest *req) case SEARCH_HIGH_12: case SEARCH_EQUAL_12: case SEARCH_LOW_12: - case SET_WINDOW: case MEDIUM_SCAN: case SEND_VOLUME_TAG: case WRITE_LONG_2: @@ -544,7 +543,6 @@ static const char *scsi_command_name(uint8_t cmd) [ SEND_DIAGNOSTIC ] = "SEND_DIAGNOSTIC", [ ALLOW_MEDIUM_REMOVAL ] = "ALLOW_MEDIUM_REMOVAL", - [ SET_WINDOW ] = "SET_WINDOW", [ READ_CAPACITY ] = "READ_CAPACITY", [ READ_10 ] = "READ_10", [ WRITE_10 ] = "WRITE_10", diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h index 413cce07b5..8513983e34 100644 --- a/hw/scsi-defs.h +++ b/hw/scsi-defs.h @@ -49,7 +49,6 @@ #define SEND_DIAGNOSTIC 0x1d #define ALLOW_MEDIUM_REMOVAL 0x1e -#define SET_WINDOW 0x24 #define READ_CAPACITY 0x25 #define READ_10 0x28 #define WRITE_10 0x2a From 8bd3e139c638d9742e12da33007a19c5204302af Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 Jul 2011 16:51:14 +0200 Subject: [PATCH 10/29] scsi: Remove REZERO_UNIT emulation REZERO_UNIT command is obsolete. Remove support for it. 
Signed-off-by: Hannes Reinecke Signed-off-by: Kevin Wolf --- hw/scsi-bus.c | 3 --- hw/scsi-defs.h | 1 - hw/scsi-disk.c | 7 ------- 3 files changed, 11 deletions(-) diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c index facc98d527..52a67846e7 100644 --- a/hw/scsi-bus.c +++ b/hw/scsi-bus.c @@ -223,7 +223,6 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd) switch(cmd[0]) { case TEST_UNIT_READY: - case REZERO_UNIT: case START_STOP: case SEEK_6: case WRITE_FILEMARKS: @@ -516,8 +515,6 @@ static const char *scsi_command_name(uint8_t cmd) { static const char *names[] = { [ TEST_UNIT_READY ] = "TEST_UNIT_READY", - [ REZERO_UNIT ] = "REZERO_UNIT", - /* REWIND and REZERO_UNIT use the same operation code */ [ REQUEST_SENSE ] = "REQUEST_SENSE", [ FORMAT_UNIT ] = "FORMAT_UNIT", [ READ_BLOCK_LIMITS ] = "READ_BLOCK_LIMITS", diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h index 8513983e34..1f40c5c8a5 100644 --- a/hw/scsi-defs.h +++ b/hw/scsi-defs.h @@ -25,7 +25,6 @@ */ #define TEST_UNIT_READY 0x00 -#define REZERO_UNIT 0x01 #define REQUEST_SENSE 0x03 #define FORMAT_UNIT 0x04 #define READ_BLOCK_LIMITS 0x05 diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index 715f2cdec4..abf0bd21ec 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -972,12 +972,6 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) break; case VERIFY: break; - case REZERO_UNIT: - DPRINTF("Rezero Unit\n"); - if (!bdrv_is_inserted(s->bs)) { - goto not_ready; - } - break; default: scsi_command_complete(r, CHECK_CONDITION, SENSE_CODE(INVALID_OPCODE)); return -1; @@ -1059,7 +1053,6 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf) case SERVICE_ACTION_IN: case REPORT_LUNS: case VERIFY: - case REZERO_UNIT: rc = scsi_disk_emulate_command(r, outbuf); if (rc < 0) { return 0; From 5e30a07d6d70d3073ff61e6db79d61c2b688502f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 Jul 2011 16:51:15 +0200 Subject: [PATCH 11/29] scsi: Sanitize command definitions Sanitize SCSI command 
definitions. Add _10 suffix to READ_CAPACITY, WRITE_VERIFY, VERIFY, READ_LONG, WRITE_LONG, and WRITE_SAME. Add new command definitions for LOCATE_10, UNMAP, VARLENGTH_CDB, WRITE_FILEMARKS_16, EXTENDED_COPY, ATA_PASSTHROUGH, ACCESS_CONTROL_IN, ACCESS_CONTROL_OUT, COMPARE_AND_WRITE, VERIFY_16, SYNCHRONIZE_CACHE_16, LOCATE_16, ERASE_16, WRITE_LONG_16, LOAD_UNLOAD, VERIFY_12. Remove invalid definition of WRITE_LONG_2. Signed-off-by: Hannes Reinecke Signed-off-by: Kevin Wolf --- hw/scsi-bus.c | 69 ++++++++++++++++++++++++++++------------------- hw/scsi-defs.h | 54 ++++++++++++++++++++++--------------- hw/scsi-disk.c | 10 +++---- hw/scsi-generic.c | 2 +- 4 files changed, 81 insertions(+), 54 deletions(-) diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c index 52a67846e7..0b0344c1fd 100644 --- a/hw/scsi-bus.c +++ b/hw/scsi-bus.c @@ -223,6 +223,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd) switch(cmd[0]) { case TEST_UNIT_READY: + case REWIND: case START_STOP: case SEEK_6: case WRITE_FILEMARKS: @@ -231,24 +232,24 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd) case RELEASE: case ERASE: case ALLOW_MEDIUM_REMOVAL: - case VERIFY: + case VERIFY_10: case SEEK_10: case SYNCHRONIZE_CACHE: case LOCK_UNLOCK_CACHE: case LOAD_UNLOAD: case SET_CD_SPEED: case SET_LIMITS: - case WRITE_LONG: + case WRITE_LONG_10: case MOVE_MEDIUM: case UPDATE_BLOCK: req->cmd.xfer = 0; break; case MODE_SENSE: break; - case WRITE_SAME: + case WRITE_SAME_10: req->cmd.xfer = 1; break; - case READ_CAPACITY: + case READ_CAPACITY_10: req->cmd.xfer = 8; break; case READ_BLOCK_LIMITS: @@ -264,7 +265,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd) req->cmd.xfer *= 8; break; case WRITE_10: - case WRITE_VERIFY: + case WRITE_VERIFY_10: case WRITE_6: case WRITE_12: case WRITE_VERIFY_12: @@ -324,7 +325,7 @@ static void scsi_req_xfer_mode(SCSIRequest *req) switch (req->cmd.buf[0]) { case WRITE_6: case WRITE_10: - case WRITE_VERIFY: + case WRITE_VERIFY_10: case WRITE_12: case 
WRITE_VERIFY_12: case WRITE_16: @@ -344,14 +345,13 @@ static void scsi_req_xfer_mode(SCSIRequest *req) case SEARCH_HIGH: case SEARCH_LOW: case UPDATE_BLOCK: - case WRITE_LONG: - case WRITE_SAME: + case WRITE_LONG_10: + case WRITE_SAME_10: case SEARCH_HIGH_12: case SEARCH_EQUAL_12: case SEARCH_LOW_12: case MEDIUM_SCAN: case SEND_VOLUME_TAG: - case WRITE_LONG_2: case PERSISTENT_RESERVE_OUT: case MAINTENANCE_OUT: req->cmd.mode = SCSI_XFER_TO_DEV; @@ -515,6 +515,7 @@ static const char *scsi_command_name(uint8_t cmd) { static const char *names[] = { [ TEST_UNIT_READY ] = "TEST_UNIT_READY", + [ REWIND ] = "REWIND", [ REQUEST_SENSE ] = "REQUEST_SENSE", [ FORMAT_UNIT ] = "FORMAT_UNIT", [ READ_BLOCK_LIMITS ] = "READ_BLOCK_LIMITS", @@ -539,13 +540,12 @@ static const char *scsi_command_name(uint8_t cmd) [ RECEIVE_DIAGNOSTIC ] = "RECEIVE_DIAGNOSTIC", [ SEND_DIAGNOSTIC ] = "SEND_DIAGNOSTIC", [ ALLOW_MEDIUM_REMOVAL ] = "ALLOW_MEDIUM_REMOVAL", - - [ READ_CAPACITY ] = "READ_CAPACITY", + [ READ_CAPACITY_10 ] = "READ_CAPACITY_10", [ READ_10 ] = "READ_10", [ WRITE_10 ] = "WRITE_10", [ SEEK_10 ] = "SEEK_10", - [ WRITE_VERIFY ] = "WRITE_VERIFY", - [ VERIFY ] = "VERIFY", + [ WRITE_VERIFY_10 ] = "WRITE_VERIFY_10", + [ VERIFY_10 ] = "VERIFY_10", [ SEARCH_HIGH ] = "SEARCH_HIGH", [ SEARCH_EQUAL ] = "SEARCH_EQUAL", [ SEARCH_LOW ] = "SEARCH_LOW", @@ -561,11 +561,14 @@ static const char *scsi_command_name(uint8_t cmd) [ WRITE_BUFFER ] = "WRITE_BUFFER", [ READ_BUFFER ] = "READ_BUFFER", [ UPDATE_BLOCK ] = "UPDATE_BLOCK", - [ READ_LONG ] = "READ_LONG", - [ WRITE_LONG ] = "WRITE_LONG", + [ READ_LONG_10 ] = "READ_LONG_10", + [ WRITE_LONG_10 ] = "WRITE_LONG_10", [ CHANGE_DEFINITION ] = "CHANGE_DEFINITION", - [ WRITE_SAME ] = "WRITE_SAME", + [ WRITE_SAME_10 ] = "WRITE_SAME_10", + [ UNMAP ] = "UNMAP", [ READ_TOC ] = "READ_TOC", + [ REPORT_DENSITY_SUPPORT ] = "REPORT_DENSITY_SUPPORT", + [ GET_CONFIGURATION ] = "GET_CONFIGURATION", [ LOG_SELECT ] = "LOG_SELECT", [ LOG_SENSE ] = "LOG_SENSE", [ 
MODE_SELECT_10 ] = "MODE_SELECT_10", @@ -574,27 +577,39 @@ static const char *scsi_command_name(uint8_t cmd) [ MODE_SENSE_10 ] = "MODE_SENSE_10", [ PERSISTENT_RESERVE_IN ] = "PERSISTENT_RESERVE_IN", [ PERSISTENT_RESERVE_OUT ] = "PERSISTENT_RESERVE_OUT", + [ WRITE_FILEMARKS_16 ] = "WRITE_FILEMARKS_16", + [ EXTENDED_COPY ] = "EXTENDED_COPY", + [ ATA_PASSTHROUGH ] = "ATA_PASSTHROUGH", + [ ACCESS_CONTROL_IN ] = "ACCESS_CONTROL_IN", + [ ACCESS_CONTROL_OUT ] = "ACCESS_CONTROL_OUT", + [ READ_16 ] = "READ_16", + [ COMPARE_AND_WRITE ] = "COMPARE_AND_WRITE", + [ WRITE_16 ] = "WRITE_16", + [ WRITE_VERIFY_16 ] = "WRITE_VERIFY_16", + [ VERIFY_16 ] = "VERIFY_16", + [ SYNCHRONIZE_CACHE_16 ] = "SYNCHRONIZE_CACHE_16", + [ LOCATE_16 ] = "LOCATE_16", + [ WRITE_SAME_16 ] = "WRITE_SAME_16", + [ ERASE_16 ] = "ERASE_16", + [ SERVICE_ACTION_IN ] = "SERVICE_ACTION_IN", + [ WRITE_LONG_16 ] = "WRITE_LONG_16", + [ REPORT_LUNS ] = "REPORT_LUNS", + [ BLANK ] = "BLANK", + [ MAINTENANCE_IN ] = "MAINTENANCE_IN", + [ MAINTENANCE_OUT ] = "MAINTENANCE_OUT", [ MOVE_MEDIUM ] = "MOVE_MEDIUM", + [ LOAD_UNLOAD ] = "LOAD_UNLOAD", [ READ_12 ] = "READ_12", [ WRITE_12 ] = "WRITE_12", [ WRITE_VERIFY_12 ] = "WRITE_VERIFY_12", + [ VERIFY_12 ] = "VERIFY_12", [ SEARCH_HIGH_12 ] = "SEARCH_HIGH_12", [ SEARCH_EQUAL_12 ] = "SEARCH_EQUAL_12", [ SEARCH_LOW_12 ] = "SEARCH_LOW_12", [ READ_ELEMENT_STATUS ] = "READ_ELEMENT_STATUS", [ SEND_VOLUME_TAG ] = "SEND_VOLUME_TAG", - [ WRITE_LONG_2 ] = "WRITE_LONG_2", - - [ REPORT_DENSITY_SUPPORT ] = "REPORT_DENSITY_SUPPORT", - [ GET_CONFIGURATION ] = "GET_CONFIGURATION", - [ READ_16 ] = "READ_16", - [ WRITE_16 ] = "WRITE_16", - [ WRITE_VERIFY_16 ] = "WRITE_VERIFY_16", - [ SERVICE_ACTION_IN ] = "SERVICE_ACTION_IN", - [ REPORT_LUNS ] = "REPORT_LUNS", - [ LOAD_UNLOAD ] = "LOAD_UNLOAD", + [ READ_DEFECT_DATA_12 ] = "READ_DEFECT_DATA_12", [ SET_CD_SPEED ] = "SET_CD_SPEED", - [ BLANK ] = "BLANK", }; if (cmd >= ARRAY_SIZE(names) || names[cmd] == NULL) diff --git a/hw/scsi-defs.h 
b/hw/scsi-defs.h index 1f40c5c8a5..f644860831 100644 --- a/hw/scsi-defs.h +++ b/hw/scsi-defs.h @@ -25,6 +25,7 @@ */ #define TEST_UNIT_READY 0x00 +#define REWIND 0x01 #define REQUEST_SENSE 0x03 #define FORMAT_UNIT 0x04 #define READ_BLOCK_LIMITS 0x05 @@ -47,13 +48,13 @@ #define RECEIVE_DIAGNOSTIC 0x1c #define SEND_DIAGNOSTIC 0x1d #define ALLOW_MEDIUM_REMOVAL 0x1e - -#define READ_CAPACITY 0x25 +#define READ_CAPACITY_10 0x25 #define READ_10 0x28 #define WRITE_10 0x2a #define SEEK_10 0x2b -#define WRITE_VERIFY 0x2e -#define VERIFY 0x2f +#define LOCATE_10 0x2b +#define WRITE_VERIFY_10 0x2e +#define VERIFY_10 0x2f #define SEARCH_HIGH 0x30 #define SEARCH_EQUAL 0x31 #define SEARCH_LOW 0x32 @@ -69,11 +70,14 @@ #define WRITE_BUFFER 0x3b #define READ_BUFFER 0x3c #define UPDATE_BLOCK 0x3d -#define READ_LONG 0x3e -#define WRITE_LONG 0x3f +#define READ_LONG_10 0x3e +#define WRITE_LONG_10 0x3f #define CHANGE_DEFINITION 0x40 -#define WRITE_SAME 0x41 +#define WRITE_SAME_10 0x41 +#define UNMAP 0x42 #define READ_TOC 0x43 +#define REPORT_DENSITY_SUPPORT 0x44 +#define GET_CONFIGURATION 0x46 #define LOG_SELECT 0x4c #define LOG_SENSE 0x4d #define MODE_SELECT_10 0x55 @@ -82,32 +86,40 @@ #define MODE_SENSE_10 0x5a #define PERSISTENT_RESERVE_IN 0x5e #define PERSISTENT_RESERVE_OUT 0x5f +#define VARLENGTH_CDB 0x7f +#define WRITE_FILEMARKS_16 0x80 +#define EXTENDED_COPY 0x83 +#define ATA_PASSTHROUGH 0x85 +#define ACCESS_CONTROL_IN 0x86 +#define ACCESS_CONTROL_OUT 0x87 +#define READ_16 0x88 +#define COMPARE_AND_WRITE 0x89 +#define WRITE_16 0x8a +#define WRITE_VERIFY_16 0x8e +#define VERIFY_16 0x8f +#define SYNCHRONIZE_CACHE_16 0x91 +#define LOCATE_16 0x92 #define WRITE_SAME_16 0x93 +#define ERASE_16 0x93 +#define SERVICE_ACTION_IN 0x9e +#define WRITE_LONG_16 0x9f +#define REPORT_LUNS 0xa0 +#define BLANK 0xa1 #define MAINTENANCE_IN 0xa3 #define MAINTENANCE_OUT 0xa4 #define MOVE_MEDIUM 0xa5 +#define LOAD_UNLOAD 0xa6 #define READ_12 0xa8 #define WRITE_12 0xaa #define WRITE_VERIFY_12 0xae +#define 
VERIFY_12 0xaf #define SEARCH_HIGH_12 0xb0 #define SEARCH_EQUAL_12 0xb1 #define SEARCH_LOW_12 0xb2 #define READ_ELEMENT_STATUS 0xb8 #define SEND_VOLUME_TAG 0xb6 -#define WRITE_LONG_2 0xea - -/* from hw/scsi-generic.c */ -#define REWIND 0x01 -#define REPORT_DENSITY_SUPPORT 0x44 -#define GET_CONFIGURATION 0x46 -#define READ_16 0x88 -#define WRITE_16 0x8a -#define WRITE_VERIFY_16 0x8e -#define SERVICE_ACTION_IN 0x9e -#define REPORT_LUNS 0xa0 -#define LOAD_UNLOAD 0xa6 -#define SET_CD_SPEED 0xbb -#define BLANK 0xa1 +#define READ_DEFECT_DATA_12 0xb7 +#define SET_CD_SPEED 0xbb /* * SAM Status codes diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index abf0bd21ec..03f244e066 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -885,7 +885,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) case ALLOW_MEDIUM_REMOVAL: bdrv_set_locked(s->bs, req->cmd.buf[4] & 1); break; - case READ_CAPACITY: + case READ_CAPACITY_10: /* The normal LEN field for this command is zero. */ memset(outbuf, 0, 8); bdrv_get_geometry(s->bs, &nb_sectors); @@ -970,7 +970,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) outbuf[3] = 8; buflen = 16; break; - case VERIFY: + case VERIFY_10: break; default: scsi_command_complete(r, CHECK_CONDITION, SENSE_CODE(INVALID_OPCODE)); @@ -1046,13 +1046,13 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf) case RELEASE_10: case START_STOP: case ALLOW_MEDIUM_REMOVAL: - case READ_CAPACITY: + case READ_CAPACITY_10: case SYNCHRONIZE_CACHE: case READ_TOC: case GET_CONFIGURATION: case SERVICE_ACTION_IN: case REPORT_LUNS: - case VERIFY: + case VERIFY_10: rc = scsi_disk_emulate_command(r, outbuf); if (rc < 0) { return 0; @@ -1075,7 +1075,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf) case WRITE_10: case WRITE_12: case WRITE_16: - case WRITE_VERIFY: + case WRITE_VERIFY_10: case WRITE_VERIFY_12: case WRITE_VERIFY_16: len = r->req.cmd.xfer / s->qdev.blocksize; diff --git a/hw/scsi-generic.c 
b/hw/scsi-generic.c index 63361b3542..7b0026eb98 100644 --- a/hw/scsi-generic.c +++ b/hw/scsi-generic.c @@ -406,7 +406,7 @@ static int get_blocksize(BlockDriverState *bdrv) memset(cmd, 0, sizeof(cmd)); memset(buf, 0, sizeof(buf)); - cmd[0] = READ_CAPACITY; + cmd[0] = READ_CAPACITY_10; memset(&io_header, 0, sizeof(io_header)); io_header.interface_id = 'S'; From f37bd73b76e7f1e300e6acfe1bb6d3b2bc63714b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 Jul 2011 16:44:46 +0200 Subject: [PATCH 12/29] scsi-disk: Remove 'drive_kind' Instead of using its own definitions scsi-disk should be using the device type of the parent device. Signed-off-by: Hannes Reinecke Signed-off-by: Kevin Wolf --- hw/scsi-defs.h | 6 +++++- hw/scsi-disk.c | 46 ++++++++++++++++++++++------------------------ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h index f644860831..27010b74c0 100644 --- a/hw/scsi-defs.h +++ b/hw/scsi-defs.h @@ -164,6 +164,7 @@ #define TYPE_DISK 0x00 #define TYPE_TAPE 0x01 +#define TYPE_PRINTER 0x02 #define TYPE_PROCESSOR 0x03 /* HP scanners use this */ #define TYPE_WORM 0x04 /* Treated as ROM by our system */ #define TYPE_ROM 0x05 @@ -171,6 +172,9 @@ #define TYPE_MOD 0x07 /* Magneto-optical disk - * - treated as TYPE_DISK */ #define TYPE_MEDIUM_CHANGER 0x08 -#define TYPE_ENCLOSURE 0x0d /* Enclosure Services Device */ +#define TYPE_STORAGE_ARRAY 0x0c /* Storage array device */ +#define TYPE_ENCLOSURE 0x0d /* Enclosure Services Device */ +#define TYPE_RBC 0x0e /* Simplified Direct-Access Device */ +#define TYPE_OSD 0x11 /* Object-storage Device */ #define TYPE_NO_LUN 0x7f diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index 03f244e066..fa198f928c 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -59,8 +59,6 @@ typedef struct SCSIDiskReq { uint32_t status; } SCSIDiskReq; -typedef enum { SCSI_HD, SCSI_CD } SCSIDriveKind; - struct SCSIDiskState { SCSIDevice qdev; @@ -74,7 +72,6 @@ struct SCSIDiskState char *version; 
char *serial; SCSISense sense; - SCSIDriveKind drive_kind; }; static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type); @@ -382,7 +379,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) return -1; } - if (s->drive_kind == SCSI_CD) { + if (s->qdev.type == TYPE_ROM) { outbuf[buflen++] = 5; } else { outbuf[buflen++] = 0; @@ -401,7 +398,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) if (s->serial) outbuf[buflen++] = 0x80; // unit serial number outbuf[buflen++] = 0x83; // device identification - if (s->drive_kind == SCSI_HD) { + if (s->qdev.type == TYPE_DISK) { outbuf[buflen++] = 0xb0; // block limits outbuf[buflen++] = 0xb2; // thin provisioning } @@ -460,7 +457,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) unsigned int opt_io_size = s->qdev.conf.opt_io_size / s->qdev.blocksize; - if (s->drive_kind == SCSI_CD) { + if (s->qdev.type == TYPE_ROM) { DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n", page_code); return -1; @@ -530,12 +527,11 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) return buflen; } - if (s->drive_kind == SCSI_CD) { - outbuf[0] = 5; + outbuf[0] = s->qdev.type & 0x1f; + if (s->qdev.type == TYPE_ROM) { outbuf[1] = 0x80; memcpy(&outbuf[16], "QEMU CD-ROM ", 16); } else { - outbuf[0] = 0; outbuf[1] = s->removable ? 0x80 : 0; memcpy(&outbuf[16], "QEMU HARDDISK ", 16); } @@ -661,7 +657,7 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p, return p[1] + 2; case 0x2a: /* CD Capabilities and Mechanical Status page. 
*/ - if (s->drive_kind != SCSI_CD) + if (s->qdev.type != TYPE_ROM) return 0; p[0] = 0x2a; p[1] = 0x14; @@ -877,7 +873,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) goto illegal_request; break; case START_STOP: - if (s->drive_kind == SCSI_CD && (req->cmd.buf[4] & 2)) { + if (s->qdev.type == TYPE_ROM && (req->cmd.buf[4] & 2)) { /* load/eject medium */ bdrv_eject(s->bs, !(req->cmd.buf[4] & 1)); } @@ -1183,7 +1179,7 @@ static void scsi_destroy(SCSIDevice *dev) blockdev_mark_auto_del(s->qdev.conf.bs); } -static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind) +static int scsi_initfn(SCSIDevice *dev, uint8_t scsi_type) { SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev); DriveInfo *dinfo; @@ -1193,9 +1189,8 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind) return -1; } s->bs = s->qdev.conf.bs; - s->drive_kind = kind; - if (kind == SCSI_HD && !bdrv_is_inserted(s->bs)) { + if (scsi_type == TYPE_DISK && !bdrv_is_inserted(s->bs)) { error_report("Device needs media, but drive is empty"); return -1; } @@ -1217,44 +1212,47 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind) return -1; } - if (kind == SCSI_CD) { + if (scsi_type == TYPE_ROM) { s->qdev.blocksize = 2048; - } else { + } else if (scsi_type == TYPE_DISK) { s->qdev.blocksize = s->qdev.conf.logical_block_size; + } else { + error_report("scsi-disk: Unhandled SCSI type %02x", scsi_type); + return -1; } s->cluster_size = s->qdev.blocksize / 512; s->bs->buffer_alignment = s->qdev.blocksize; - s->qdev.type = TYPE_DISK; + s->qdev.type = scsi_type; qemu_add_vm_change_state_handler(scsi_dma_restart_cb, s); - bdrv_set_removable(s->bs, kind == SCSI_CD); + bdrv_set_removable(s->bs, scsi_type == TYPE_ROM); add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, ",0"); return 0; } static int scsi_hd_initfn(SCSIDevice *dev) { - return scsi_initfn(dev, SCSI_HD); + return scsi_initfn(dev, TYPE_DISK); } static int scsi_cd_initfn(SCSIDevice *dev) { - return scsi_initfn(dev, 
SCSI_CD); + return scsi_initfn(dev, TYPE_ROM); } static int scsi_disk_initfn(SCSIDevice *dev) { - SCSIDriveKind kind; DriveInfo *dinfo; + uint8_t scsi_type; if (!dev->conf.bs) { - kind = SCSI_HD; /* will die in scsi_initfn() */ + scsi_type = TYPE_DISK; /* will die in scsi_initfn() */ } else { dinfo = drive_get_by_blockdev(dev->conf.bs); - kind = dinfo->media_cd ? SCSI_CD : SCSI_HD; + scsi_type = dinfo->media_cd ? TYPE_ROM : TYPE_DISK; } - return scsi_initfn(dev, kind); + return scsi_initfn(dev, scsi_type); } #define DEFINE_SCSI_DISK_PROPERTIES() \ From 5bf3f8e4f71e46f80b76dc8a03cc0c37cad5cde0 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Wed, 27 Jul 2011 20:12:00 +0200 Subject: [PATCH 13/29] block: Removed unused function bdrv_write_sync Signed-off-by: Frediano Ziglio Signed-off-by: Kevin Wolf --- block.c | 13 ------------- block.h | 2 -- 2 files changed, 15 deletions(-) diff --git a/block.c b/block.c index 8859f9b414..4c66b2cf40 100644 --- a/block.c +++ b/block.c @@ -1110,19 +1110,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. 
- */ -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num, - buf, BDRV_SECTOR_SIZE * nb_sectors); -} - /** * Truncate file to 'offset' bytes (needed only for file protocols) */ diff --git a/block.h b/block.h index 59cc410e3b..e672bc669a 100644 --- a/block.h +++ b/block.h @@ -85,8 +85,6 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, const void *buf, int count); int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); From c1ee7d56f3f2c8c5fb90452f4df72fa402f61527 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Wed, 27 Jul 2011 20:12:01 +0200 Subject: [PATCH 14/29] raw-posix: Typo fix Signed-off-by: Frediano Ziglio Signed-off-by: Kevin Wolf --- block/raw-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 6672d31da3..6dd708688b 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -587,7 +587,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, /* * If O_DIRECT is used the buffer needs to be aligned on a sector - * boundary. Check if this is the case or telll the low-level + * boundary. Check if this is the case or tell the low-level * driver that it needs to copy the buffer. 
*/ if (s->aligned_buf) { From f6e8ffc22fe153ba981f2747e4c52ea7e55f6ecc Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Wed, 27 Jul 2011 20:12:02 +0200 Subject: [PATCH 15/29] raw-posix: Always check paio_init result Signed-off-by: Frediano Ziglio Signed-off-by: Kevin Wolf --- block/raw-posix.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 6dd708688b..c5c99446c0 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -230,13 +230,15 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } } + /* We're falling back to POSIX AIO in some cases so init always */ + if (paio_init() < 0) { + goto out_free_buf; + } + #ifdef CONFIG_LINUX_AIO if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { - /* We're falling back to POSIX AIO in some cases */ - paio_init(); - s->aio_ctx = laio_init(); if (!s->aio_ctx) { goto out_free_buf; @@ -245,9 +247,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } else #endif { - if (paio_init() < 0) { - goto out_free_buf; - } #ifdef CONFIG_LINUX_AIO s->use_aio = 0; #endif From 00dccaf1f848290d979a4b1e6248281ce1b32aaa Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 17 Jan 2011 16:08:14 +0000 Subject: [PATCH 16/29] coroutine: introduce coroutines Asynchronous code is becoming very complex. At the same time synchronous code is growing because it is convenient to write. Sometimes duplicate code paths are even added, one synchronous and the other asynchronous. This patch introduces coroutines which allow code that looks synchronous but is asynchronous under the covers. A coroutine has its own stack and is therefore able to preserve state across blocking operations, which traditionally require callback functions and manual marshalling of parameters. 
Creating and starting a coroutine is easy: coroutine = qemu_coroutine_create(my_coroutine); qemu_coroutine_enter(coroutine, my_data); The coroutine then executes until it returns or yields: void coroutine_fn my_coroutine(void *opaque) { MyData *my_data = opaque; /* do some work */ qemu_coroutine_yield(); /* do some more work */ } Yielding switches control back to the caller of qemu_coroutine_enter(). This is typically used to switch back to the main thread's event loop after issuing an asynchronous I/O request. The request callback will then invoke qemu_coroutine_enter() once more to switch back to the coroutine. Note that if coroutines are used only from threads which hold the global mutex they will never execute concurrently. This makes programming with coroutines easier than with threads. Race conditions cannot occur since only one coroutine may be active at any time. Other coroutines can only run across yield. This coroutines implementation is based on the gtk-vnc implementation written by Anthony Liguori but it has been significantly rewritten by Kevin Wolf to use setjmp()/longjmp() instead of the more expensive swapcontext() and by Paolo Bonzini for Windows Fibers support. 
Signed-off-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- Makefile.objs | 7 ++ coroutine-ucontext.c | 230 +++++++++++++++++++++++++++++++++++++++++++ coroutine-win32.c | 92 +++++++++++++++++ qemu-coroutine-int.h | 48 +++++++++ qemu-coroutine.c | 75 ++++++++++++++ qemu-coroutine.h | 95 ++++++++++++++++++ trace-events | 5 + 7 files changed, 552 insertions(+) create mode 100644 coroutine-ucontext.c create mode 100644 coroutine-win32.c create mode 100644 qemu-coroutine-int.h create mode 100644 qemu-coroutine.c create mode 100644 qemu-coroutine.h diff --git a/Makefile.objs b/Makefile.objs index 6991a9f52a..28e1762463 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -10,6 +10,12 @@ oslib-obj-y = osdep.o oslib-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o +####################################################################### +# coroutines +coroutine-obj-y = qemu-coroutine.o +coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o +coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o + ####################################################################### # block-obj-y is code used by both qemu system emulation and qemu-img @@ -69,6 +75,7 @@ common-obj-y += readline.o console.o cursor.o qemu-error.o common-obj-y += $(oslib-obj-y) common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o +common-obj-y += $(coroutine-obj-y) common-obj-y += tcg-runtime.o host-utils.o common-obj-y += irq.o ioport.o input.o diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c new file mode 100644 index 0000000000..41c2379a2a --- /dev/null +++ b/coroutine-ucontext.c @@ -0,0 +1,230 @@ +/* + * ucontext coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori + * Copyright (C) 2011 Kevin Wolf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; 
either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include <stdlib.h> +#include <setjmp.h> +#include <stdint.h> +#include <pthread.h> +#include <ucontext.h> +#include "qemu-common.h" +#include "qemu-coroutine-int.h" + +enum { + /* Maximum free pool size prevents holding too many freed coroutines */ + POOL_MAX_SIZE = 64, +}; + +typedef struct { + Coroutine base; + void *stack; + jmp_buf env; +} CoroutineUContext; + +/** + * Per-thread coroutine bookkeeping + */ +typedef struct { + /** Currently executing coroutine */ + Coroutine *current; + + /** Free list to speed up creation */ + QLIST_HEAD(, Coroutine) pool; + unsigned int pool_size; + + /** The default coroutine */ + CoroutineUContext leader; +} CoroutineThreadState; + +static pthread_key_t thread_state_key; + +/* + * va_args to makecontext() must be type 'int', so passing + * the pointer we need may require several int args.
This + * union is a quick hack to let us do that + */ +union cc_arg { + void *p; + int i[2]; +}; + +static CoroutineThreadState *coroutine_get_thread_state(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + if (!s) { + s = qemu_mallocz(sizeof(*s)); + s->current = &s->leader.base; + QLIST_INIT(&s->pool); + pthread_setspecific(thread_state_key, s); + } + return s; +} + +static void qemu_coroutine_thread_cleanup(void *opaque) +{ + CoroutineThreadState *s = opaque; + Coroutine *co; + Coroutine *tmp; + + QLIST_FOREACH_SAFE(co, &s->pool, pool_next, tmp) { + qemu_free(DO_UPCAST(CoroutineUContext, base, co)->stack); + qemu_free(co); + } + qemu_free(s); +} + +static void __attribute__((constructor)) coroutine_init(void) +{ + int ret; + + ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup); + if (ret != 0) { + fprintf(stderr, "unable to create leader key: %s\n", strerror(errno)); + abort(); + } +} + +static void coroutine_trampoline(int i0, int i1) +{ + union cc_arg arg; + CoroutineUContext *self; + Coroutine *co; + + arg.i[0] = i0; + arg.i[1] = i1; + self = arg.p; + co = &self->base; + + /* Initialize longjmp environment and switch back the caller */ + if (!setjmp(self->env)) { + longjmp(*(jmp_buf *)co->entry_arg, 1); + } + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +static Coroutine *coroutine_new(void) +{ + const size_t stack_size = 1 << 20; + CoroutineUContext *co; + ucontext_t old_uc, uc; + jmp_buf old_env; + union cc_arg arg; + + /* The ucontext functions preserve signal masks which incurs a system call + * overhead. setjmp()/longjmp() does not preserve signal masks but only + * works on the current stack. Since we need a way to create and switch to + * a new stack, use the ucontext functions for that but setjmp()/longjmp() + * for everything else. 
+ */ + + if (getcontext(&uc) == -1) { + abort(); + } + + co = qemu_mallocz(sizeof(*co)); + co->stack = qemu_malloc(stack_size); + co->base.entry_arg = &old_env; /* stash away our jmp_buf */ + + uc.uc_link = &old_uc; + uc.uc_stack.ss_sp = co->stack; + uc.uc_stack.ss_size = stack_size; + uc.uc_stack.ss_flags = 0; + + arg.p = co; + + makecontext(&uc, (void (*)(void))coroutine_trampoline, + 2, arg.i[0], arg.i[1]); + + /* swapcontext() in, longjmp() back out */ + if (!setjmp(old_env)) { + swapcontext(&old_uc, &uc); + } + return &co->base; +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineThreadState *s = coroutine_get_thread_state(); + Coroutine *co; + + co = QLIST_FIRST(&s->pool); + if (co) { + QLIST_REMOVE(co, pool_next); + s->pool_size--; + } else { + co = coroutine_new(); + } + return co; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineThreadState *s = coroutine_get_thread_state(); + CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_); + + if (s->pool_size < POOL_MAX_SIZE) { + QLIST_INSERT_HEAD(&s->pool, &co->base, pool_next); + co->base.caller = NULL; + s->pool_size++; + return; + } + + qemu_free(co->stack); + qemu_free(co); +} + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_); + CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_); + CoroutineThreadState *s = coroutine_get_thread_state(); + int ret; + + s->current = to_; + + ret = setjmp(from->env); + if (ret == 0) { + longjmp(to->env, action); + } + return ret; +} + +Coroutine *qemu_coroutine_self(void) +{ + CoroutineThreadState *s = coroutine_get_thread_state(); + + return s->current; +} + +bool qemu_in_coroutine(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + return s && s->current->caller; +} diff --git a/coroutine-win32.c b/coroutine-win32.c new file mode 100644 index 0000000000..0e29448473 --- /dev/null +++ 
b/coroutine-win32.c @@ -0,0 +1,92 @@ +/* + * Win32 coroutine initialization code + * + * Copyright (c) 2011 Kevin Wolf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "qemu-coroutine-int.h" + +typedef struct +{ + Coroutine base; + + LPVOID fiber; + CoroutineAction action; +} CoroutineWin32; + +static __thread CoroutineWin32 leader; +static __thread Coroutine *current; + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_); + CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_); + + current = to_; + + to->action = action; + SwitchToFiber(to->fiber); + return from->action; +} + +static void CALLBACK coroutine_trampoline(void *co_) +{ + Coroutine *co = co_; + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + const size_t stack_size = 1 << 20; + CoroutineWin32 *co; + + co = qemu_mallocz(sizeof(*co)); + co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base); + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_); + + DeleteFiber(co->fiber); + qemu_free(co); +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + leader.fiber = ConvertThreadToFiber(NULL); + } + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +} diff --git a/qemu-coroutine-int.h b/qemu-coroutine-int.h new file mode 100644 index 0000000000..64915c2fa5 --- /dev/null +++ b/qemu-coroutine-int.h @@ -0,0 +1,48 @@ +/* + * Coroutine internals + * + * Copyright (c) 2011 Kevin Wolf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit 
persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_COROUTINE_INT_H +#define QEMU_COROUTINE_INT_H + +#include "qemu-queue.h" +#include "qemu-coroutine.h" + +typedef enum { + COROUTINE_YIELD = 1, + COROUTINE_TERMINATE = 2, +} CoroutineAction; + +struct Coroutine { + CoroutineEntry *entry; + void *entry_arg; + Coroutine *caller; + QLIST_ENTRY(Coroutine) pool_next; +}; + +Coroutine *qemu_coroutine_new(void); +void qemu_coroutine_delete(Coroutine *co); +CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to, + CoroutineAction action); + +#endif diff --git a/qemu-coroutine.c b/qemu-coroutine.c new file mode 100644 index 0000000000..600be2643c --- /dev/null +++ b/qemu-coroutine.c @@ -0,0 +1,75 @@ +/* + * QEMU coroutines + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi + * Kevin Wolf + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ * + */ + +#include "trace.h" +#include "qemu-common.h" +#include "qemu-coroutine.h" +#include "qemu-coroutine-int.h" + +Coroutine *qemu_coroutine_create(CoroutineEntry *entry) +{ + Coroutine *co = qemu_coroutine_new(); + co->entry = entry; + return co; +} + +static void coroutine_swap(Coroutine *from, Coroutine *to) +{ + CoroutineAction ret; + + ret = qemu_coroutine_switch(from, to, COROUTINE_YIELD); + + switch (ret) { + case COROUTINE_YIELD: + return; + case COROUTINE_TERMINATE: + trace_qemu_coroutine_terminate(to); + qemu_coroutine_delete(to); + return; + default: + abort(); + } +} + +void qemu_coroutine_enter(Coroutine *co, void *opaque) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_coroutine_enter(self, co, opaque); + + if (co->caller) { + fprintf(stderr, "Co-routine re-entered recursively\n"); + abort(); + } + + co->caller = self; + co->entry_arg = opaque; + coroutine_swap(self, co); +} + +void coroutine_fn qemu_coroutine_yield(void) +{ + Coroutine *self = qemu_coroutine_self(); + Coroutine *to = self->caller; + + trace_qemu_coroutine_yield(self, to); + + if (!to) { + fprintf(stderr, "Co-routine is yielding to no one\n"); + abort(); + } + + self->caller = NULL; + coroutine_swap(self, to); +} diff --git a/qemu-coroutine.h b/qemu-coroutine.h new file mode 100644 index 0000000000..08255c7c41 --- /dev/null +++ b/qemu-coroutine.h @@ -0,0 +1,95 @@ +/* + * QEMU coroutine implementation + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef QEMU_COROUTINE_H +#define QEMU_COROUTINE_H + +#include <stdbool.h> + +/** + * Coroutines are a mechanism for stack switching and can be used for + * cooperative userspace threading.
These functions provide a simple but + * useful flavor of coroutines that is suitable for writing sequential code, + * rather than callbacks, for operations that need to give up control while + * waiting for events to complete. + * + * These functions are re-entrant and may be used outside the global mutex. + */ + +/** + * Mark a function that executes in coroutine context + * + * Functions that execute in coroutine context cannot be called directly from + * normal functions. In the future it would be nice to enable compiler or + * static checker support for catching such errors. This annotation might make + * it possible and in the meantime it serves as documentation. + * + * For example: + * + * static void coroutine_fn foo(void) { + * .... + * } + */ +#define coroutine_fn + +typedef struct Coroutine Coroutine; + +/** + * Coroutine entry point + * + * When the coroutine is entered for the first time, opaque is passed in as an + * argument. + * + * When this function returns, the coroutine is destroyed automatically and + * execution continues in the caller who last entered the coroutine. + */ +typedef void coroutine_fn CoroutineEntry(void *opaque); + +/** + * Create a new coroutine + * + * Use qemu_coroutine_enter() to actually transfer control to the coroutine. + */ +Coroutine *qemu_coroutine_create(CoroutineEntry *entry); + +/** + * Transfer control to a coroutine + * + * The opaque argument is passed as the argument to the entry point when + * entering the coroutine for the first time. It is subsequently ignored. + */ +void qemu_coroutine_enter(Coroutine *coroutine, void *opaque); + +/** + * Transfer control back to a coroutine's caller + * + * This function does not return until the coroutine is re-entered using + * qemu_coroutine_enter(). 
+ */ +void coroutine_fn qemu_coroutine_yield(void); + +/** + * Get the currently executing coroutine + */ +Coroutine *coroutine_fn qemu_coroutine_self(void); + +/** + * Return whether or not currently inside a coroutine + * + * This can be used to write functions that work both when in coroutine context + * and when not in coroutine context. Note that such functions cannot use the + * coroutine_fn annotation since they work outside coroutine context. + */ +bool qemu_in_coroutine(void); + +#endif /* QEMU_COROUTINE_H */ diff --git a/trace-events b/trace-events index 713f042081..136f7759f2 100644 --- a/trace-events +++ b/trace-events @@ -425,3 +425,8 @@ disable qemu_put_ram_ptr(void* addr) "%p" # hw/xen_platform.c disable xen_platform_log(char *s) "xen platform: %s" + +# qemu-coroutine.c +disable qemu_coroutine_enter(void *from, void *to, void *opaque) "from %p to %p opaque %p" +disable qemu_coroutine_yield(void *from, void *to) "from %p to %p" +disable qemu_coroutine_terminate(void *co) "self %p" From d0e2fce5366bff40cf2bf48d2f6ff72b4d8de124 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 9 Jun 2011 23:11:06 +0530 Subject: [PATCH 17/29] coroutine: implement coroutines using gthread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On platforms that don't support makecontext(3) use gthread based coroutine implementation. Darwin has makecontext(3) but getcontext(3) is stubbed out to return ENOTSUP. Andreas Färber debugged this and contributed the ./configure test which solves the issue for Darwin/ppc64 (and ppc) v10.5. [Original patch by Aneesh, made consistent with coroutine-ucontext.c and switched to GStaticPrivate by Stefan. Tested on Linux and OpenBSD.] 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Stefan Hajnoczi --- Makefile.objs | 4 ++ configure | 18 ++++++ coroutine-gthread.c | 131 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 coroutine-gthread.c diff --git a/Makefile.objs b/Makefile.objs index 28e1762463..5679e1fa06 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -13,7 +13,11 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o ####################################################################### # coroutines coroutine-obj-y = qemu-coroutine.o +ifeq ($(CONFIG_UCONTEXT_COROUTINE),y) coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o +else +coroutine-obj-$(CONFIG_POSIX) += coroutine-gthread.o +endif coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o ####################################################################### diff --git a/configure b/configure index 77194cf9a7..1eed0cd585 100755 --- a/configure +++ b/configure @@ -2540,6 +2540,20 @@ EOF fi fi +########################################## +# check if we have makecontext + +ucontext_coroutine=no +if test "$darwin" != "yes"; then + cat > $TMPC << EOF +#include <ucontext.h> +int main(void) { makecontext(0, 0, 0); } +EOF + if compile_prog "" "" ; then + ucontext_coroutine=yes + fi +fi + ########################################## # End of CC checks # After here, no more $cc or $ld runs @@ -3015,6 +3029,10 @@ if test "$rbd" = "yes" ; then echo "CONFIG_RBD=y" >> $config_host_mak fi +if test "$ucontext_coroutine" = "yes" ; then + echo "CONFIG_UCONTEXT_COROUTINE=y" >> $config_host_mak +fi + # USB host support case "$usb" in linux) diff --git a/coroutine-gthread.c b/coroutine-gthread.c new file mode 100644 index 0000000000..f09877e14f --- /dev/null +++ b/coroutine-gthread.c @@ -0,0 +1,131 @@ +/* + * GThread coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori + * Copyright (C) 2011 Aneesh Kumar K.V + * + * This library is free software; you can redistribute it and/or + * modify it
under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <glib.h> +#include "qemu-common.h" +#include "qemu-coroutine-int.h" + +typedef struct { + Coroutine base; + GThread *thread; + bool runnable; + CoroutineAction action; +} CoroutineGThread; + +static GCond *coroutine_cond; +static GStaticMutex coroutine_lock = G_STATIC_MUTEX_INIT; +static GStaticPrivate coroutine_key = G_STATIC_PRIVATE_INIT; + +static void __attribute__((constructor)) coroutine_init(void) +{ + if (!g_thread_supported()) { + g_thread_init(NULL); + } + + coroutine_cond = g_cond_new(); +} + +static void coroutine_wait_runnable_locked(CoroutineGThread *co) +{ + while (!co->runnable) { + g_cond_wait(coroutine_cond, g_static_mutex_get_mutex(&coroutine_lock)); + } +} + +static void coroutine_wait_runnable(CoroutineGThread *co) +{ + g_static_mutex_lock(&coroutine_lock); + coroutine_wait_runnable_locked(co); + g_static_mutex_unlock(&coroutine_lock); +} + +static gpointer coroutine_thread(gpointer opaque) +{ + CoroutineGThread *co = opaque; + + g_static_private_set(&coroutine_key, co, NULL); + coroutine_wait_runnable(co); + co->base.entry(co->base.entry_arg); + qemu_coroutine_switch(&co->base, co->base.caller, COROUTINE_TERMINATE); + return NULL; +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineGThread *co; + + co = qemu_mallocz(sizeof(*co)); + co->thread = g_thread_create_full(coroutine_thread, co, 0, TRUE, TRUE, + G_THREAD_PRIORITY_NORMAL, NULL); + if (!co->thread) { + qemu_free(co);
+ return NULL; + } + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineGThread *co = DO_UPCAST(CoroutineGThread, base, co_); + + g_thread_join(co->thread); + qemu_free(co); +} + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, + Coroutine *to_, + CoroutineAction action) +{ + CoroutineGThread *from = DO_UPCAST(CoroutineGThread, base, from_); + CoroutineGThread *to = DO_UPCAST(CoroutineGThread, base, to_); + + g_static_mutex_lock(&coroutine_lock); + from->runnable = false; + from->action = action; + to->runnable = true; + to->action = action; + g_cond_broadcast(coroutine_cond); + + if (action != COROUTINE_TERMINATE) { + coroutine_wait_runnable_locked(from); + } + g_static_mutex_unlock(&coroutine_lock); + return from->action; +} + +Coroutine *qemu_coroutine_self(void) +{ + CoroutineGThread *co = g_static_private_get(&coroutine_key); + + if (!co) { + co = qemu_mallocz(sizeof(*co)); + co->runnable = true; + g_static_private_set(&coroutine_key, co, (GDestroyNotify)qemu_free); + } + + return &co->base; +} + +bool qemu_in_coroutine(void) +{ + CoroutineGThread *co = g_static_private_get(&coroutine_key); + + return co && co->base.caller; +} From aa7ee42ed3b2c196de9f390fc9a5e339dd7417e3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 10 May 2011 10:21:18 +0100 Subject: [PATCH 18/29] coroutine: add test-coroutine automated tests To run automated tests for coroutines: make test-coroutine ./test-coroutine On success the program terminates with exit status 0. On failure an error message is written to stderr and the program exits with exit status 1. 
Signed-off-by: Stefan Hajnoczi --- .gitignore | 1 + Makefile | 3 +- test-coroutine.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 test-coroutine.c diff --git a/.gitignore b/.gitignore index 54835bcb97..59c343c414 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ qemu-io qemu-ga qemu-monitor.texi QMP/qmp-commands.txt +test-coroutine .gdbinit *.a *.aux diff --git a/Makefile b/Makefile index 48552512d6..2becedcf88 100644 --- a/Makefile +++ b/Makefile @@ -151,7 +151,7 @@ qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trac qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@," GEN $@") -check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o: $(GENERATED_HEADERS) +check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o test-coroutine.o: $(GENERATED_HEADERS) CHECK_PROG_DEPS = qemu-malloc.o $(oslib-obj-y) $(trace-obj-y) qemu-tool.o @@ -161,6 +161,7 @@ check-qdict: check-qdict.o qdict.o qfloat.o qint.o qstring.o qbool.o qlist.o $(C check-qlist: check-qlist.o qlist.o qint.o $(CHECK_PROG_DEPS) check-qfloat: check-qfloat.o qfloat.o $(CHECK_PROG_DEPS) check-qjson: check-qjson.o qfloat.o qint.o qdict.o qstring.o qlist.o qbool.o qjson.o json-streamer.o json-lexer.o json-parser.o error.o qerror.o qemu-error.o $(CHECK_PROG_DEPS) +test-coroutine: test-coroutine.o qemu-timer-common.o async.o $(coroutine-obj-y) $(CHECK_PROG_DEPS) $(qapi-obj-y): $(GENERATED_HEADERS) qapi-dir := qapi-generated diff --git a/test-coroutine.c b/test-coroutine.c new file mode 100644 index 0000000000..9e9d3c95bc --- /dev/null +++ b/test-coroutine.c @@ -0,0 +1,162 @@ +/* + * Coroutine tests + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. 
+ * See the COPYING.LIB file in the top-level directory. + * + */ + +#include <glib.h> +#include "qemu-coroutine.h" + +/* + * Check that qemu_in_coroutine() works + */ + +static void coroutine_fn verify_in_coroutine(void *opaque) +{ + g_assert(qemu_in_coroutine()); +} + +static void test_in_coroutine(void) +{ + Coroutine *coroutine; + + g_assert(!qemu_in_coroutine()); + + coroutine = qemu_coroutine_create(verify_in_coroutine); + qemu_coroutine_enter(coroutine, NULL); +} + +/* + * Check that qemu_coroutine_self() works + */ + +static void coroutine_fn verify_self(void *opaque) +{ + g_assert(qemu_coroutine_self() == opaque); +} + +static void test_self(void) +{ + Coroutine *coroutine; + + coroutine = qemu_coroutine_create(verify_self); + qemu_coroutine_enter(coroutine, coroutine); +} + +/* + * Check that coroutines may nest multiple levels + */ + +typedef struct { + unsigned int n_enter; /* num coroutines entered */ + unsigned int n_return; /* num coroutines returned */ + unsigned int max; /* maximum level of nesting */ +} NestData; + +static void coroutine_fn nest(void *opaque) +{ + NestData *nd = opaque; + + nd->n_enter++; + + if (nd->n_enter < nd->max) { + Coroutine *child; + + child = qemu_coroutine_create(nest); + qemu_coroutine_enter(child, nd); + } + + nd->n_return++; +} + +static void test_nesting(void) +{ + Coroutine *root; + NestData nd = { + .n_enter = 0, + .n_return = 0, + .max = 128, + }; + + root = qemu_coroutine_create(nest); + qemu_coroutine_enter(root, &nd); + + /* Must enter and return from max nesting level */ + g_assert_cmpint(nd.n_enter, ==, nd.max); + g_assert_cmpint(nd.n_return, ==, nd.max); +} + +/* + * Check that yield/enter transfer control correctly + */ + +static void coroutine_fn yield_5_times(void *opaque) +{ + bool *done = opaque; + int i; + + for (i = 0; i < 5; i++) { + qemu_coroutine_yield(); + } + *done = true; +} + +static void test_yield(void) +{ + Coroutine *coroutine; + bool done = false; + int i = -1; /* one extra time to return from
coroutine */ + + coroutine = qemu_coroutine_create(yield_5_times); + while (!done) { + qemu_coroutine_enter(coroutine, &done); + i++; + } + g_assert_cmpint(i, ==, 5); /* coroutine must yield 5 times */ +} + +/* + * Check that creation, enter, and return work + */ + +static void coroutine_fn set_and_exit(void *opaque) +{ + bool *done = opaque; + + *done = true; +} + +static void test_lifecycle(void) +{ + Coroutine *coroutine; + bool done = false; + + /* Create, enter, and return from coroutine */ + coroutine = qemu_coroutine_create(set_and_exit); + qemu_coroutine_enter(coroutine, &done); + g_assert(done); /* expect done to be true (first time) */ + + /* Repeat to check that no state affects this test */ + done = false; + coroutine = qemu_coroutine_create(set_and_exit); + qemu_coroutine_enter(coroutine, &done); + g_assert(done); /* expect done to be true (second time) */ +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/basic/lifecycle", test_lifecycle); + g_test_add_func("/basic/yield", test_yield); + g_test_add_func("/basic/nesting", test_nesting); + g_test_add_func("/basic/self", test_self); + g_test_add_func("/basic/in_coroutine", test_in_coroutine); + return g_test_run(); +} From 5e3840ce24040cbd1957008489cbc136c43ca391 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 12 May 2011 08:27:39 +0100 Subject: [PATCH 19/29] coroutine: add test-coroutine --benchmark-lifecycle Add a microbenchmark for coroutine create, enter, and return (aka lifecycle). This is a useful benchmark because users are expected to create many coroutines, one per I/O request for example, and we therefore need to provide good performance in that scenario. To run: make test-coroutine ./test-coroutine --benchmark-lifecycle 20000000 This will do 20,000,000 coroutine create, enter, return iterations and print the resulting time. 
Signed-off-by: Stefan Hajnoczi --- test-coroutine.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test-coroutine.c b/test-coroutine.c index 9e9d3c95bc..bf9f3e91b5 100644 --- a/test-coroutine.c +++ b/test-coroutine.c @@ -150,6 +150,33 @@ static void test_lifecycle(void) g_assert(done); /* expect done to be true (second time) */ } +/* + * Lifecycle benchmark + */ + +static void coroutine_fn empty_coroutine(void *opaque) +{ + /* Do nothing */ +} + +static void perf_lifecycle(void) +{ + Coroutine *coroutine; + unsigned int i, max; + double duration; + + max = 1000000; + + g_test_timer_start(); + for (i = 0; i < max; i++) { + coroutine = qemu_coroutine_create(empty_coroutine); + qemu_coroutine_enter(coroutine, NULL); + } + duration = g_test_timer_elapsed(); + + g_test_message("Lifecycle %u iterations: %f s\n", max, duration); +} + int main(int argc, char **argv) { g_test_init(&argc, &argv, NULL); @@ -158,5 +185,8 @@ int main(int argc, char **argv) g_test_add_func("/basic/nesting", test_nesting); g_test_add_func("/basic/self", test_self); g_test_add_func("/basic/in_coroutine", test_in_coroutine); + if (g_test_perf()) { + g_test_add_func("/perf/lifecycle", perf_lifecycle); + } return g_test_run(); } From da1fa91d6cca8a6d3da9c2b222fa485429db297c Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 14 Jul 2011 17:27:13 +0200 Subject: [PATCH 20/29] block: Add bdrv_co_readv/writev Add new block driver callbacks bdrv_co_readv/writev, which work on a QEMUIOVector like bdrv_aio_*, but don't need a callback. The function may only be called inside a coroutine, so a block driver implementing this interface can yield instead of blocking during I/O. 
Signed-off-by: Kevin Wolf --- Makefile.objs | 2 +- block.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ block.h | 5 +++++ block_int.h | 6 ++++++ trace-events | 2 ++ 5 files changed, 59 insertions(+), 1 deletion(-) diff --git a/Makefile.objs b/Makefile.objs index 5679e1fa06..9549e2a16f 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -25,6 +25,7 @@ coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o +block-obj-y += $(coroutine-obj-y) block-obj-$(CONFIG_POSIX) += posix-aio-compat.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o @@ -79,7 +80,6 @@ common-obj-y += readline.o console.o cursor.o qemu-error.o common-obj-y += $(oslib-obj-y) common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o -common-obj-y += $(coroutine-obj-y) common-obj-y += tcg-runtime.o host-utils.o common-obj-y += irq.o ioport.o input.o diff --git a/block.c b/block.c index 4c66b2cf40..1329299ed7 100644 --- a/block.c +++ b/block.c @@ -1110,6 +1110,51 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BlockDriver *drv = bs->drv; + + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BlockDriver *drv = bs->drv; + + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + if (bs->dirty_bitmap) { + 
set_dirty_bitmap(bs, sector_num, nb_sectors, 1); + } + + if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { + bs->wr_highest_sector = sector_num + nb_sectors - 1; + } + + return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); +} + /** * Truncate file to 'offset' bytes (needed only for file protocols) */ diff --git a/block.h b/block.h index e672bc669a..a3bfaafef0 100644 --- a/block.h +++ b/block.h @@ -4,6 +4,7 @@ #include "qemu-aio.h" #include "qemu-common.h" #include "qemu-option.h" +#include "qemu-coroutine.h" #include "qobject.h" /* block.c */ @@ -85,6 +86,10 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, const void *buf, int count); int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); diff --git a/block_int.h b/block_int.h index efefbee289..f6d02b38a7 100644 --- a/block_int.h +++ b/block_int.h @@ -27,6 +27,7 @@ #include "block.h" #include "qemu-option.h" #include "qemu-queue.h" +#include "qemu-coroutine.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 @@ -77,6 +78,11 @@ struct BlockDriver { int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs, int num_reqs); int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a, diff --git a/trace-events b/trace-events index 
136f7759f2..46bceca1bc 100644 --- a/trace-events +++ b/trace-events @@ -66,6 +66,8 @@ disable bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p" disable bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d" +disable bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" +disable bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" # hw/virtio-blk.c disable virtio_blk_req_complete(void *req, int status) "req %p status %d" From 68485420187094c26f86faee5c7f68b5d6a03603 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 30 Jun 2011 10:05:46 +0200 Subject: [PATCH 21/29] block: Emulate AIO functions with bdrv_co_readv/writev Use the bdrv_co_readv/writev callbacks to implement bdrv_aio_readv/writev and bdrv_read/write if a driver provides the coroutine version instead of the synchronous or AIO version. 
Signed-off-by: Kevin Wolf --- block.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 1329299ed7..0d973e6e8e 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include "block_int.h" #include "module.h" #include "qemu-objects.h" +#include "qemu-coroutine.h" #ifdef CONFIG_BSD #include @@ -57,6 +58,12 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -169,7 +176,13 @@ void path_combine(char *dest, int dest_size, void bdrv_register(BlockDriver *bdrv) { - if (!bdrv->bdrv_aio_readv) { + if (bdrv->bdrv_co_readv) { + /* Emulate AIO by coroutines, and sync by AIO */ + bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em; + bdrv->bdrv_read = bdrv_read_em; + bdrv->bdrv_write = bdrv_write_em; + } else if (!bdrv->bdrv_aio_readv) { /* add AIO emulation layer */ bdrv->bdrv_aio_readv = bdrv_aio_readv_em; bdrv->bdrv_aio_writev = bdrv_aio_writev_em; @@ -2614,6 +2627,89 @@ static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); } + +typedef struct BlockDriverAIOCBCoroutine { + BlockDriverAIOCB common; + BlockRequest req; + bool is_write; + QEMUBH* bh; +} BlockDriverAIOCBCoroutine; + +static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) +{ + qemu_aio_flush(); +} + 
+static AIOPool bdrv_em_co_aio_pool = { + .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), + .cancel = bdrv_aio_co_cancel_em, +}; + +static void bdrv_co_rw_bh(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); +} + +static void coroutine_fn bdrv_co_rw(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov); + } else { + acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov); + } + + acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb); + qemu_bh_schedule(acb->bh); +} + +static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_rw); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, + false); +} + +static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, + true); +} + static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { From 
f9f05dc58c50d19ad762e6c1ce6b5def9814a4ed Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Jul 2011 13:50:26 +0200 Subject: [PATCH 22/29] block: Add bdrv_co_readv/writev emulation In order to be able to call bdrv_co_readv/writev for drivers that don't implement the functions natively, add an emulation that uses the AIO functions to implement them. Signed-off-by: Kevin Wolf --- block.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++----- trace-events | 1 + 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 0d973e6e8e..e6abea85df 100644 --- a/block.c +++ b/block.c @@ -64,6 +64,12 @@ static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs, static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -182,14 +188,19 @@ void bdrv_register(BlockDriver *bdrv) bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em; bdrv->bdrv_read = bdrv_read_em; bdrv->bdrv_write = bdrv_write_em; - } else if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } else if (!bdrv->bdrv_read) { - /* add synchronous IO emulation layer */ - bdrv->bdrv_read = bdrv_read_em; - bdrv->bdrv_write = bdrv_write_em; + } else { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } else if (!bdrv->bdrv_read) { + /* add synchronous 
IO emulation layer */ + bdrv->bdrv_read = bdrv_read_em; + bdrv->bdrv_write = bdrv_write_em; + } } if (!bdrv->bdrv_aio_flush) @@ -2855,6 +2866,62 @@ void qemu_aio_release(void *p) pool->free_aiocb = acb; } +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockDriverAIOCB *acb; + + if (is_write) { + acb = bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io(is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + /**************************************************************/ /* removable device support */ diff --git a/trace-events b/trace-events index 46bceca1bc..bc9be3010b 100644 --- a/trace-events +++ b/trace-events @@ -68,6 +68,7 @@ disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaq disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d" disable bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p 
sector_num %"PRId64" nb_sectors %d" disable bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" +disable bdrv_co_io(int is_write, void *acb) "is_write %d acb %p" # hw/virtio-blk.c disable virtio_blk_req_complete(void *req, int status) "req %p status %d" From b96e92470ab4a87268e8b174602eaea6c508003b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 30 Jun 2011 17:56:46 +0200 Subject: [PATCH 23/29] coroutines: Locks Signed-off-by: Kevin Wolf --- Makefile.objs | 2 +- qemu-coroutine-int.h | 1 + qemu-coroutine-lock.c | 124 ++++++++++++++++++++++++++++++++++++++++++ qemu-coroutine.h | 64 ++++++++++++++++++++++ trace-events | 8 +++ 5 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 qemu-coroutine-lock.c diff --git a/Makefile.objs b/Makefile.objs index 9549e2a16f..89ca3611b3 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -12,7 +12,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o ####################################################################### # coroutines -coroutine-obj-y = qemu-coroutine.o +coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o ifeq ($(CONFIG_UCONTEXT_COROUTINE),y) coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o else diff --git a/qemu-coroutine-int.h b/qemu-coroutine-int.h index 64915c2fa5..d495615cf6 100644 --- a/qemu-coroutine-int.h +++ b/qemu-coroutine-int.h @@ -38,6 +38,7 @@ struct Coroutine { void *entry_arg; Coroutine *caller; QLIST_ENTRY(Coroutine) pool_next; + QTAILQ_ENTRY(Coroutine) co_queue_next; }; Coroutine *qemu_coroutine_new(void); diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c new file mode 100644 index 0000000000..abaa1f7967 --- /dev/null +++ b/qemu-coroutine-lock.c @@ -0,0 +1,124 @@ +/* + * coroutine queues and locks + * + * Copyright (c) 2011 Kevin Wolf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + 
* in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "qemu-coroutine.h" +#include "qemu-coroutine-int.h" +#include "qemu-queue.h" +#include "trace.h" + +static QTAILQ_HEAD(, Coroutine) unlock_bh_queue = + QTAILQ_HEAD_INITIALIZER(unlock_bh_queue); + +struct unlock_bh { + QEMUBH *bh; +}; + +static void qemu_co_queue_next_bh(void *opaque) +{ + struct unlock_bh *unlock_bh = opaque; + Coroutine *next; + + trace_qemu_co_queue_next_bh(); + while ((next = QTAILQ_FIRST(&unlock_bh_queue))) { + QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next); + qemu_coroutine_enter(next, NULL); + } + + qemu_bh_delete(unlock_bh->bh); + qemu_free(unlock_bh); +} + +void qemu_co_queue_init(CoQueue *queue) +{ + QTAILQ_INIT(&queue->entries); +} + +void coroutine_fn qemu_co_queue_wait(CoQueue *queue) +{ + Coroutine *self = qemu_coroutine_self(); + QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + qemu_coroutine_yield(); + assert(qemu_in_coroutine()); +} + +bool qemu_co_queue_next(CoQueue *queue) +{ + struct unlock_bh *unlock_bh; + Coroutine *next; + + next = QTAILQ_FIRST(&queue->entries); 
+ if (next) { + QTAILQ_REMOVE(&queue->entries, next, co_queue_next); + QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next); + trace_qemu_co_queue_next(next); + + unlock_bh = qemu_malloc(sizeof(*unlock_bh)); + unlock_bh->bh = qemu_bh_new(qemu_co_queue_next_bh, unlock_bh); + qemu_bh_schedule(unlock_bh->bh); + } + + return (next != NULL); +} + +bool qemu_co_queue_empty(CoQueue *queue) +{ + return (QTAILQ_FIRST(&queue->entries) == NULL); +} + +void qemu_co_mutex_init(CoMutex *mutex) +{ + memset(mutex, 0, sizeof(*mutex)); + qemu_co_queue_init(&mutex->queue); +} + +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_co_mutex_lock_entry(mutex, self); + + while (mutex->locked) { + qemu_co_queue_wait(&mutex->queue); + } + + mutex->locked = true; + + trace_qemu_co_mutex_lock_return(mutex, self); +} + +void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_co_mutex_unlock_entry(mutex, self); + + assert(mutex->locked == true); + assert(qemu_in_coroutine()); + + mutex->locked = false; + qemu_co_queue_next(&mutex->queue); + + trace_qemu_co_mutex_unlock_return(mutex, self); +} diff --git a/qemu-coroutine.h b/qemu-coroutine.h index 08255c7c41..2f2fd95552 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -5,6 +5,7 @@ * * Authors: * Stefan Hajnoczi + * Kevin Wolf * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. @@ -15,6 +16,7 @@ #define QEMU_COROUTINE_H #include +#include "qemu-queue.h" /** * Coroutines are a mechanism for stack switching and can be used for @@ -92,4 +94,66 @@ Coroutine *coroutine_fn qemu_coroutine_self(void); */ bool qemu_in_coroutine(void); + + +/** + * CoQueues are a mechanism to queue coroutines in order to continue executing + * them later. They provide the fundamental primitives on which coroutine locks + * are built. 
+ */ +typedef struct CoQueue { + QTAILQ_HEAD(, Coroutine) entries; +} CoQueue; + +/** + * Initialise a CoQueue. This must be called before any other operation is used + * on the CoQueue. + */ +void qemu_co_queue_init(CoQueue *queue); + +/** + * Adds the current coroutine to the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait(CoQueue *queue); + +/** + * Restarts the next coroutine in the CoQueue and removes it from the queue. + * + * Returns true if a coroutine was restarted, false if the queue is empty. + */ +bool qemu_co_queue_next(CoQueue *queue); + +/** + * Checks if the CoQueue is empty. + */ +bool qemu_co_queue_empty(CoQueue *queue); + + +/** + * Provides a mutex that can be used to synchronise coroutines + */ +typedef struct CoMutex { + bool locked; + CoQueue queue; +} CoMutex; + +/** + * Initialises a CoMutex. This must be called before any other operation is used + * on the CoMutex. + */ +void qemu_co_mutex_init(CoMutex *mutex); + +/** + * Locks the mutex. If the lock cannot be taken immediately, control is + * transferred to the caller of the current coroutine. + */ +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); + +/** + * Unlocks the mutex and schedules the next coroutine that was waiting for this + * lock to be run. 
+ */ +void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); + #endif /* QEMU_COROUTINE_H */ diff --git a/trace-events b/trace-events index bc9be3010b..19d31e3541 100644 --- a/trace-events +++ b/trace-events @@ -433,3 +433,11 @@ disable xen_platform_log(char *s) "xen platform: %s" disable qemu_coroutine_enter(void *from, void *to, void *opaque) "from %p to %p opaque %p" disable qemu_coroutine_yield(void *from, void *to) "from %p to %p" disable qemu_coroutine_terminate(void *co) "self %p" + +# qemu-coroutine-lock.c +disable qemu_co_queue_next_bh(void) "" +disable qemu_co_queue_next(void *next) "next %p" +disable qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" +disable qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" +disable qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" +disable qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p" From 68d100e905453ebbeea8e915f4f18a2bd4339fe8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 30 Jun 2011 17:42:09 +0200 Subject: [PATCH 24/29] qcow2: Use coroutines Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 26 +++-- block/qcow2.c | 240 +++++++++++++++--------------------------- block/qcow2.h | 5 +- 3 files changed, 102 insertions(+), 169 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 882f50a80b..81cf77d83c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -697,12 +697,12 @@ err: * m->depends_on is set to NULL and the other fields in m are meaningless. * * If the cluster is newly allocated, m->nb_clusters is set to the number of - * contiguous clusters that have been allocated. This may be 0 if the request - * conflict with another write request in flight; in this case, m->depends_on - * is set and the remaining fields of m are meaningless. + * contiguous clusters that have been allocated. 
In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. * - * If m->nb_clusters is non-zero, the other fields of m are valid and contain - * information about the first allocated cluster. + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. * * Return 0 on success and -errno in error cases */ @@ -721,6 +721,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, return ret; } +again: nb_clusters = size_to_clusters(s, n_end << 9); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); @@ -792,12 +793,12 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, } if (nb_clusters == 0) { - /* Set dependency and wait for a callback */ - m->depends_on = old_alloc; - m->nb_clusters = 0; - *num = 0; - - goto out_wait_dependency; + /* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. 
*/ + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests); + qemu_co_mutex_lock(&s->lock); + goto again; } } } @@ -834,9 +835,6 @@ out: return 0; -out_wait_dependency: - return qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - fail: qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); fail_put: diff --git a/block/qcow2.c b/block/qcow2.c index 48e1b95689..f07d550a96 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -276,6 +276,9 @@ static int qcow2_open(BlockDriverState *bs, int flags) goto fail; } + /* Initialise locks */ + qemu_co_mutex_init(&s->lock); + #ifdef DEBUG_ALLOC qcow2_check_refcounts(bs); #endif @@ -379,7 +382,6 @@ typedef struct QCowAIOCB { uint64_t cluster_offset; uint8_t *cluster_data; bool is_write; - BlockDriverAIOCB *hd_aiocb; QEMUIOVector hd_qiov; QEMUBH *bh; QCowL2Meta l2meta; @@ -389,8 +391,6 @@ typedef struct QCowAIOCB { static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) { QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); - if (acb->hd_aiocb) - bdrv_aio_cancel(acb->hd_aiocb); qemu_aio_release(acb); } @@ -399,46 +399,16 @@ static AIOPool qcow2_aio_pool = { .cancel = qcow2_aio_cancel, }; -static void qcow2_aio_read_cb(void *opaque, int ret); -static void qcow2_aio_write_cb(void *opaque, int ret); - -static void qcow2_aio_rw_bh(void *opaque) +/* + * Returns 0 when the request is completed successfully, 1 when there is still + * a part left to do and -errno in error cases. 
+ */ +static int qcow2_aio_read_cb(QCowAIOCB *acb) { - QCowAIOCB *acb = opaque; - qemu_bh_delete(acb->bh); - acb->bh = NULL; - - if (acb->is_write) { - qcow2_aio_write_cb(opaque, 0); - } else { - qcow2_aio_read_cb(opaque, 0); - } -} - -static int qcow2_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb) -{ - if (acb->bh) - return -EIO; - - acb->bh = qemu_bh_new(cb, acb); - if (!acb->bh) - return -EIO; - - qemu_bh_schedule(acb->bh); - - return 0; -} - -static void qcow2_aio_read_cb(void *opaque, int ret) -{ - QCowAIOCB *acb = opaque; BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster, n1; - - acb->hd_aiocb = NULL; - if (ret < 0) - goto done; + int ret; /* post process the read buffer */ if (!acb->cluster_offset) { @@ -463,8 +433,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret) if (acb->remaining_sectors == 0) { /* request completed */ - ret = 0; - goto done; + return 0; } /* prepare next AIO request */ @@ -477,7 +446,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret) ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9, &acb->cur_nr_sectors, &acb->cluster_offset); if (ret < 0) { - goto done; + return ret; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -494,42 +463,35 @@ static void qcow2_aio_read_cb(void *opaque, int ret) acb->sector_num, acb->cur_nr_sectors); if (n1 > 0) { BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); - acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, - &acb->hd_qiov, n1, qcow2_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto done; + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, acb->sector_num, + n1, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return ret; } - } else { - ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb); - if (ret < 0) - goto done; } + return 1; } else { /* Note: in this case, no need to wait */ qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors); - ret = 
qcow2_schedule_bh(qcow2_aio_rw_bh, acb); - if (ret < 0) - goto done; + return 1; } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ ret = qcow2_decompress_cluster(bs, acb->cluster_offset); if (ret < 0) { - goto done; + return ret; } qemu_iovec_from_buffer(&acb->hd_qiov, s->cluster_cache + index_in_cluster * 512, 512 * acb->cur_nr_sectors); - ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb); - if (ret < 0) - goto done; + return 1; } else { if ((acb->cluster_offset & 511) != 0) { - ret = -EIO; - goto done; + return -EIO; } if (s->crypt_method) { @@ -550,21 +512,17 @@ static void qcow2_aio_read_cb(void *opaque, int ret) } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - acb->hd_aiocb = bdrv_aio_readv(bs->file, + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->cur_nr_sectors, - qcow2_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto done; + acb->cur_nr_sectors, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return ret; } } - return; -done: - acb->common.cb(acb->common.opaque, ret); - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); + return 1; } static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, @@ -577,7 +535,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque); if (!acb) return NULL; - acb->hd_aiocb = NULL; acb->sector_num = sector_num; acb->qiov = qiov; acb->is_write = is_write; @@ -589,79 +546,73 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb->cur_nr_sectors = 0; acb->cluster_offset = 0; acb->l2meta.nb_clusters = 0; - QLIST_INIT(&acb->l2meta.dependent_requests); + qemu_co_queue_init(&acb->l2meta.dependent_requests); return acb; } -static BlockDriverAIOCB *qcow2_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int 
nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { + BDRVQcowState *s = bs->opaque; QCowAIOCB *acb; int ret; - acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); - if (!acb) - return NULL; + acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0); - ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb); - if (ret < 0) { - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); - return NULL; - } + qemu_co_mutex_lock(&s->lock); + do { + ret = qcow2_aio_read_cb(acb); + } while (ret > 0); + qemu_co_mutex_unlock(&s->lock); - return &acb->common; + qemu_iovec_destroy(&acb->hd_qiov); + qemu_aio_release(acb); + + return ret; } -static void run_dependent_requests(QCowL2Meta *m) +static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) { - QCowAIOCB *req; - QCowAIOCB *next; - /* Take the request off the list of running requests */ if (m->nb_clusters != 0) { QLIST_REMOVE(m, next_in_flight); } /* Restart all dependent requests */ - QLIST_FOREACH_SAFE(req, &m->dependent_requests, next_depend, next) { - qcow2_aio_write_cb(req, 0); + if (!qemu_co_queue_empty(&m->dependent_requests)) { + qemu_co_mutex_unlock(&s->lock); + while(qemu_co_queue_next(&m->dependent_requests)); + qemu_co_mutex_lock(&s->lock); } - - /* Empty the list for the next part of the request */ - QLIST_INIT(&m->dependent_requests); } -static void qcow2_aio_write_cb(void *opaque, int ret) +/* + * Returns 0 when the request is completed successfully, 1 when there is still + * a part left to do and -errno in error cases. 
+ */ +static int qcow2_aio_write_cb(QCowAIOCB *acb) { - QCowAIOCB *acb = opaque; BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster; int n_end; + int ret; - acb->hd_aiocb = NULL; + ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta); - if (ret >= 0) { - ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta); + run_dependent_requests(s, &acb->l2meta); + + if (ret < 0) { + return ret; } - run_dependent_requests(&acb->l2meta); - - if (ret < 0) - goto done; - acb->remaining_sectors -= acb->cur_nr_sectors; acb->sector_num += acb->cur_nr_sectors; acb->bytes_done += acb->cur_nr_sectors * 512; if (acb->remaining_sectors == 0) { /* request completed */ - ret = 0; - goto done; + return 0; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -673,18 +624,10 @@ static void qcow2_aio_write_cb(void *opaque, int ret) ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9, index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta); if (ret < 0) { - goto done; + return ret; } acb->cluster_offset = acb->l2meta.cluster_offset; - - /* Need to wait for another request? If so, we are done for now. 
*/ - if (acb->l2meta.nb_clusters == 0 && acb->l2meta.depends_on != NULL) { - QLIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests, - acb, next_depend); - return; - } - assert((acb->cluster_offset & 511) == 0); qemu_iovec_reset(&acb->hd_qiov); @@ -709,51 +652,40 @@ static void qcow2_aio_write_cb(void *opaque, int ret) } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - acb->hd_aiocb = bdrv_aio_writev(bs->file, - (acb->cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->cur_nr_sectors, - qcow2_aio_write_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto fail; + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (acb->cluster_offset >> 9) + index_in_cluster, + acb->cur_nr_sectors, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return ret; } - return; - -fail: - if (acb->l2meta.nb_clusters != 0) { - QLIST_REMOVE(&acb->l2meta, next_in_flight); - } -done: - acb->common.cb(acb->common.opaque, ret); - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); + return 1; } -static BlockDriverAIOCB *qcow2_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, + QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; QCowAIOCB *acb; int ret; + acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1); s->cluster_cache_offset = -1; /* disable compressed cache */ - acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); - if (!acb) - return NULL; + qemu_co_mutex_lock(&s->lock); + do { + ret = qcow2_aio_write_cb(acb); + } while (ret > 0); + qemu_co_mutex_unlock(&s->lock); - ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb); - if (ret < 0) { - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); - return NULL; - } + qemu_iovec_destroy(&acb->hd_qiov); + qemu_aio_release(acb); - return &acb->common; + return ret; } 
static void qcow2_close(BlockDriverState *bs) @@ -881,7 +813,7 @@ static int preallocate(BlockDriverState *bs) nb_sectors = bdrv_getlength(bs) >> 9; offset = 0; - QLIST_INIT(&meta.dependent_requests); + qemu_co_queue_init(&meta.dependent_requests); meta.cluster_offset = 0; while (nb_sectors) { @@ -899,7 +831,7 @@ static int preallocate(BlockDriverState *bs) /* There are no dependent requests, but we need to remove our request * from the list of in-flight requests */ - run_dependent_requests(&meta); + run_dependent_requests(bs->opaque, &meta); /* TODO Preallocate data if requested */ @@ -1387,8 +1319,8 @@ static BlockDriver bdrv_qcow2 = { .bdrv_set_key = qcow2_set_key, .bdrv_make_empty = qcow2_make_empty, - .bdrv_aio_readv = qcow2_aio_readv, - .bdrv_aio_writev = qcow2_aio_writev, + .bdrv_co_readv = qcow2_co_readv, + .bdrv_co_writev = qcow2_co_writev, .bdrv_aio_flush = qcow2_aio_flush, .bdrv_discard = qcow2_discard, diff --git a/block/qcow2.h b/block/qcow2.h index 6a0a21b694..de23abe1a4 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -26,6 +26,7 @@ #define BLOCK_QCOW2_H #include "aes.h" +#include "qemu-coroutine.h" //#define DEBUG_ALLOC //#define DEBUG_ALLOC2 @@ -114,6 +115,8 @@ typedef struct BDRVQcowState { int64_t free_cluster_index; int64_t free_byte_offset; + CoMutex lock; + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ uint32_t crypt_method_header; AES_KEY aes_encrypt_key; @@ -146,7 +149,7 @@ typedef struct QCowL2Meta int nb_available; int nb_clusters; struct QCowL2Meta *depends_on; - QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests; + CoQueue dependent_requests; QLIST_ENTRY(QCowL2Meta) next_in_flight; } QCowL2Meta; From 52b8eb60132b27ad53476490e9d7579003390cfa Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Jul 2011 16:27:42 +0200 Subject: [PATCH 25/29] qcow: Use coroutines The old qcow format is another user of the AsyncContext infrastructure. 
Converting it to coroutines (and therefore CoMutexes) allows to remove AsyncContexts. Signed-off-by: Kevin Wolf --- block/qcow.c | 186 ++++++++++++++++++--------------------------------- 1 file changed, 65 insertions(+), 121 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 227b104e36..6447c2a1c0 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -73,6 +73,7 @@ typedef struct BDRVQcowState { uint32_t crypt_method_header; AES_KEY aes_encrypt_key; AES_KEY aes_decrypt_key; + CoMutex lock; } BDRVQcowState; static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); @@ -517,11 +518,11 @@ static AIOPool qcow_aio_pool = { static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int is_write) + int is_write) { QCowAIOCB *acb; - acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque); + acb = qemu_aio_get(&qcow_aio_pool, bs, NULL, NULL); if (!acb) return NULL; acb->hd_aiocb = NULL; @@ -542,48 +543,15 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static void qcow_aio_read_cb(void *opaque, int ret); -static void qcow_aio_write_cb(void *opaque, int ret); - -static void qcow_aio_rw_bh(void *opaque) -{ - QCowAIOCB *acb = opaque; - qemu_bh_delete(acb->bh); - acb->bh = NULL; - - if (acb->is_write) { - qcow_aio_write_cb(opaque, 0); - } else { - qcow_aio_read_cb(opaque, 0); - } -} - -static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb) -{ - if (acb->bh) { - return -EIO; - } - - acb->bh = qemu_bh_new(cb, acb); - if (!acb->bh) { - return -EIO; - } - - qemu_bh_schedule(acb->bh); - - return 0; -} - -static void qcow_aio_read_cb(void *opaque, int ret) +static int qcow_aio_read_cb(void *opaque) { QCowAIOCB *acb = opaque; BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster; + int ret; acb->hd_aiocb = NULL; - if (ret < 0) - goto done; redo: /* post process the read buffer */ @@ -605,8 +573,7 @@ 
static void qcow_aio_read_cb(void *opaque, int ret) if (acb->nb_sectors == 0) { /* request completed */ - ret = 0; - goto done; + return 0; } /* prepare next AIO request */ @@ -623,11 +590,12 @@ static void qcow_aio_read_cb(void *opaque, int ret) acb->hd_iov.iov_base = (void *)acb->buf; acb->hd_iov.iov_len = acb->n * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, - &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto done; + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, acb->sector_num, + acb->n, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return -EIO; } } else { /* Note: in this case, no need to wait */ @@ -637,64 +605,56 @@ static void qcow_aio_read_cb(void *opaque, int ret) } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ if (decompress_cluster(bs, acb->cluster_offset) < 0) { - ret = -EIO; - goto done; + return -EIO; } memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); goto redo; } else { if ((acb->cluster_offset & 511) != 0) { - ret = -EIO; - goto done; + return -EIO; } acb->hd_iov.iov_base = (void *)acb->buf; acb->hd_iov.iov_len = acb->n * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_readv(bs->file, + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto done; + acb->n, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return ret; } } - return; + return 1; +} + +static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + QCowAIOCB *acb; + int ret; + + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); + + 
qemu_co_mutex_lock(&s->lock); + do { + ret = qcow_aio_read_cb(acb); + } while (ret > 0); + qemu_co_mutex_unlock(&s->lock); -done: if (acb->qiov->niov > 1) { qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); qemu_vfree(acb->orig_buf); } - acb->common.cb(acb->common.opaque, ret); qemu_aio_release(acb); + + return ret; } -static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - QCowAIOCB *acb; - int ret; - - acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); - if (!acb) - return NULL; - - ret = qcow_schedule_bh(qcow_aio_rw_bh, acb); - if (ret < 0) { - if (acb->qiov->niov > 1) { - qemu_vfree(acb->orig_buf); - } - qemu_aio_release(acb); - return NULL; - } - - return &acb->common; -} - -static void qcow_aio_write_cb(void *opaque, int ret) +static int qcow_aio_write_cb(void *opaque) { QCowAIOCB *acb = opaque; BlockDriverState *bs = acb->common.bs; @@ -702,20 +662,17 @@ static void qcow_aio_write_cb(void *opaque, int ret) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; + int ret; acb->hd_aiocb = NULL; - if (ret < 0) - goto done; - acb->nb_sectors -= acb->n; acb->sector_num += acb->n; acb->buf += acb->n * 512; if (acb->nb_sectors == 0) { /* request completed */ - ret = 0; - goto done; + return 0; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -726,16 +683,11 @@ static void qcow_aio_write_cb(void *opaque, int ret) index_in_cluster, index_in_cluster + acb->n); if (!cluster_offset || (cluster_offset & 511) != 0) { - ret = -EIO; - goto done; + return -EIO; } if (s->crypt_method) { if (!acb->cluster_data) { acb->cluster_data = qemu_mallocz(s->cluster_size); - if (!acb->cluster_data) { - ret = -ENOMEM; - goto done; - } } encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, acb->n, 1, &s->aes_encrypt_key); @@ -747,26 +699,19 @@ static void qcow_aio_write_cb(void *opaque, int 
ret) acb->hd_iov.iov_base = (void *)src_buf; acb->hd_iov.iov_len = acb->n * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_writev(bs->file, - (cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->n, - qcow_aio_write_cb, acb); - if (acb->hd_aiocb == NULL) { - ret = -EIO; - goto done; + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + acb->n, &acb->hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + return ret; } - return; - -done: - if (acb->qiov->niov > 1) - qemu_vfree(acb->orig_buf); - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); + return 1; } -static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; QCowAIOCB *acb; @@ -774,21 +719,20 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, s->cluster_cache_offset = -1; /* disable compressed cache */ - acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); - if (!acb) - return NULL; + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); + qemu_co_mutex_lock(&s->lock); + do { + ret = qcow_aio_write_cb(acb); + } while (ret > 0); + qemu_co_mutex_unlock(&s->lock); - ret = qcow_schedule_bh(qcow_aio_rw_bh, acb); - if (ret < 0) { - if (acb->qiov->niov > 1) { - qemu_vfree(acb->orig_buf); - } - qemu_aio_release(acb); - return NULL; + if (acb->qiov->niov > 1) { + qemu_vfree(acb->orig_buf); } + qemu_aio_release(acb); - return &acb->common; + return ret; } static void qcow_close(BlockDriverState *bs) @@ -1020,8 +964,8 @@ static BlockDriver bdrv_qcow = { .bdrv_is_allocated = qcow_is_allocated, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_aio_readv = qcow_aio_readv, - 
.bdrv_aio_writev = qcow_aio_writev, + .bdrv_co_readv = qcow_co_readv, + .bdrv_co_writev = qcow_co_writev, .bdrv_aio_flush = qcow_aio_flush, .bdrv_write_compressed = qcow_write_compressed, .bdrv_get_info = qcow_get_info, From 384acbf46b70edf0d2c1648aa1a92a90bcf7057d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Jul 2011 16:36:40 +0200 Subject: [PATCH 26/29] async: Remove AsyncContext The purpose of AsyncContexts was to protect qcow and qcow2 against reentrancy during an emulated bdrv_read/write (which includes a qemu_aio_wait() call and can run AIO callbacks of different requests if it weren't for AsyncContexts). Now both qcow and qcow2 are protected by CoMutexes and AsyncContexts can be removed. Signed-off-by: Kevin Wolf --- async.c | 98 ++++------------------------------------------ block.c | 6 --- block/qed-table.c | 14 ------- block/qed.c | 4 -- linux-aio.c | 43 ++------------------ posix-aio-compat.c | 11 ------ qemu-common.h | 4 -- 7 files changed, 11 insertions(+), 169 deletions(-) diff --git a/async.c b/async.c index fd313dffb7..3fe70b9deb 100644 --- a/async.c +++ b/async.c @@ -25,92 +25,8 @@ #include "qemu-common.h" #include "qemu-aio.h" -/* - * An AsyncContext protects the callbacks of AIO requests and Bottom Halves - * against interfering with each other. A typical example is qcow2 that accepts - * asynchronous requests, but relies for manipulation of its metadata on - * synchronous bdrv_read/write that doesn't trigger any callbacks. - * - * However, these functions are often emulated using AIO which means that AIO - * callbacks must be run - but at the same time we must not run callbacks of - * other requests as they might start to modify metadata and corrupt the - * internal state of the caller of bdrv_read/write. - * - * To achieve the desired semantics we switch into a new AsyncContext. - * Callbacks must only be run if they belong to the current AsyncContext. - * Otherwise they need to be queued until their own context is active again. 
- * This is how you can make qemu_aio_wait() wait only for your own callbacks. - * - * The AsyncContexts form a stack. When you leave a AsyncContexts, you always - * return to the old ("parent") context. - */ -struct AsyncContext { - /* Consecutive number of the AsyncContext (position in the stack) */ - int id; - - /* Anchor of the list of Bottom Halves belonging to the context */ - struct QEMUBH *first_bh; - - /* Link to parent context */ - struct AsyncContext *parent; -}; - -/* The currently active AsyncContext */ -static struct AsyncContext *async_context = &(struct AsyncContext) { 0 }; - -/* - * Enter a new AsyncContext. Already scheduled Bottom Halves and AIO callbacks - * won't be called until this context is left again. - */ -void async_context_push(void) -{ - struct AsyncContext *new = qemu_mallocz(sizeof(*new)); - new->parent = async_context; - new->id = async_context->id + 1; - async_context = new; -} - -/* Run queued AIO completions and destroy Bottom Half */ -static void bh_run_aio_completions(void *opaque) -{ - QEMUBH **bh = opaque; - qemu_bh_delete(*bh); - qemu_free(bh); - qemu_aio_process_queue(); -} -/* - * Leave the currently active AsyncContext. All Bottom Halves belonging to the - * old context are executed before changing the context. 
- */ -void async_context_pop(void) -{ - struct AsyncContext *old = async_context; - QEMUBH **bh; - - /* Flush the bottom halves, we don't want to lose them */ - while (qemu_bh_poll()); - - /* Switch back to the parent context */ - async_context = async_context->parent; - qemu_free(old); - - if (async_context == NULL) { - abort(); - } - - /* Schedule BH to run any queued AIO completions as soon as possible */ - bh = qemu_malloc(sizeof(*bh)); - *bh = qemu_bh_new(bh_run_aio_completions, bh); - qemu_bh_schedule(*bh); -} - -/* - * Returns the ID of the currently active AsyncContext - */ -int get_async_context_id(void) -{ - return async_context->id; -} +/* Anchor of the list of Bottom Halves belonging to the context */ +static struct QEMUBH *first_bh; /***********************************************************/ /* bottom halves (can be seen as timers which expire ASAP) */ @@ -130,8 +46,8 @@ QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) bh = qemu_mallocz(sizeof(QEMUBH)); bh->cb = cb; bh->opaque = opaque; - bh->next = async_context->first_bh; - async_context->first_bh = bh; + bh->next = first_bh; + first_bh = bh; return bh; } @@ -141,7 +57,7 @@ int qemu_bh_poll(void) int ret; ret = 0; - for (bh = async_context->first_bh; bh; bh = next) { + for (bh = first_bh; bh; bh = next) { next = bh->next; if (!bh->deleted && bh->scheduled) { bh->scheduled = 0; @@ -153,7 +69,7 @@ int qemu_bh_poll(void) } /* remove deleted bhs */ - bhp = &async_context->first_bh; + bhp = &first_bh; while (*bhp) { bh = *bhp; if (bh->deleted) { @@ -199,7 +115,7 @@ void qemu_bh_update_timeout(int *timeout) { QEMUBH *bh; - for (bh = async_context->first_bh; bh; bh = bh->next) { + for (bh = first_bh; bh; bh = bh->next) { if (!bh->deleted && bh->scheduled) { if (bh->idle) { /* idle bottom halves will be polled at least diff --git a/block.c b/block.c index e6abea85df..0d05b4b32d 100644 --- a/block.c +++ b/block.c @@ -2777,8 +2777,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, struct 
iovec iov; QEMUIOVector qiov; - async_context_push(); - async_ret = NOT_DONE; iov.iov_base = (void *)buf; iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; @@ -2796,7 +2794,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, fail: - async_context_pop(); return async_ret; } @@ -2808,8 +2805,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, struct iovec iov; QEMUIOVector qiov; - async_context_push(); - async_ret = NOT_DONE; iov.iov_base = (void *)buf; iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; @@ -2825,7 +2820,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, } fail: - async_context_pop(); return async_ret; } diff --git a/block/qed-table.c b/block/qed-table.c index d38c673547..d96afa81d7 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -179,16 +179,12 @@ int qed_read_l1_table_sync(BDRVQEDState *s) { int ret = -EINPROGRESS; - async_context_push(); - qed_read_table(s, s->header.l1_table_offset, s->l1_table, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { qemu_aio_wait(); } - async_context_pop(); - return ret; } @@ -205,15 +201,11 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, { int ret = -EINPROGRESS; - async_context_push(); - qed_write_l1_table(s, index, n, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { qemu_aio_wait(); } - async_context_pop(); - return ret; } @@ -282,14 +274,11 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset { int ret = -EINPROGRESS; - async_context_push(); - qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { qemu_aio_wait(); } - async_context_pop(); return ret; } @@ -307,13 +296,10 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, { int ret = -EINPROGRESS; - async_context_push(); - qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { qemu_aio_wait(); } - async_context_pop(); return ret; } diff --git a/block/qed.c 
b/block/qed.c index 39703793e9..333f067582 100644 --- a/block/qed.c +++ b/block/qed.c @@ -680,16 +680,12 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num, }; QEDRequest request = { .l2_table = NULL }; - async_context_push(); - qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); while (cb.is_allocated == -1) { qemu_aio_wait(); } - async_context_pop(); - qed_unref_l2_cache_entry(request.l2_table); return cb.is_allocated; diff --git a/linux-aio.c b/linux-aio.c index 68f4b3d757..dc3faf2499 100644 --- a/linux-aio.c +++ b/linux-aio.c @@ -31,7 +31,6 @@ struct qemu_laiocb { struct iocb iocb; ssize_t ret; size_t nbytes; - int async_context_id; QLIST_ENTRY(qemu_laiocb) node; }; @@ -39,7 +38,6 @@ struct qemu_laio_state { io_context_t ctx; int efd; int count; - QLIST_HEAD(, qemu_laiocb) completed_reqs; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -49,7 +47,6 @@ static inline ssize_t io_event_ret(struct io_event *ev) /* * Completes an AIO request (calls the callback and frees the ACB). - * Be sure to be in the right AsyncContext before calling this function. */ static void qemu_laio_process_completion(struct qemu_laio_state *s, struct qemu_laiocb *laiocb) @@ -72,42 +69,12 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, } /* - * Processes all queued AIO requests, i.e. requests that have return from OS - * but their callback was not called yet. Requests that cannot have their - * callback called in the current AsyncContext, remain in the queue. - * - * Returns 1 if at least one request could be completed, 0 otherwise. + * All requests are directly processed when they complete, so there's nothing + * left to do during qemu_aio_wait(). 
*/ static int qemu_laio_process_requests(void *opaque) { - struct qemu_laio_state *s = opaque; - struct qemu_laiocb *laiocb, *next; - int res = 0; - - QLIST_FOREACH_SAFE (laiocb, &s->completed_reqs, node, next) { - if (laiocb->async_context_id == get_async_context_id()) { - qemu_laio_process_completion(s, laiocb); - QLIST_REMOVE(laiocb, node); - res = 1; - } - } - - return res; -} - -/* - * Puts a request in the completion queue so that its callback is called the - * next time when it's possible. If we already are in the right AsyncContext, - * the request is completed immediately instead. - */ -static void qemu_laio_enqueue_completed(struct qemu_laio_state *s, - struct qemu_laiocb* laiocb) -{ - if (laiocb->async_context_id == get_async_context_id()) { - qemu_laio_process_completion(s, laiocb); - } else { - QLIST_INSERT_HEAD(&s->completed_reqs, laiocb, node); - } + return 0; } static void qemu_laio_completion_cb(void *opaque) @@ -141,7 +108,7 @@ static void qemu_laio_completion_cb(void *opaque) container_of(iocb, struct qemu_laiocb, iocb); laiocb->ret = io_event_ret(&events[i]); - qemu_laio_enqueue_completed(s, laiocb); + qemu_laio_process_completion(s, laiocb); } } } @@ -204,7 +171,6 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, laiocb->nbytes = nb_sectors * 512; laiocb->ctx = s; laiocb->ret = -EINPROGRESS; - laiocb->async_context_id = get_async_context_id(); iocbs = &laiocb->iocb; @@ -239,7 +205,6 @@ void *laio_init(void) struct qemu_laio_state *s; s = qemu_mallocz(sizeof(*s)); - QLIST_INIT(&s->completed_reqs); s->efd = eventfd(0, 0); if (s->efd == -1) goto out_free_state; diff --git a/posix-aio-compat.c b/posix-aio-compat.c index c4116e30f2..788d113860 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -49,8 +49,6 @@ struct qemu_paiocb { ssize_t ret; int active; struct qemu_paiocb *next; - - int async_context_id; }; typedef struct PosixAioState { @@ -420,7 +418,6 @@ static int posix_aio_process_queue(void *opaque) struct 
qemu_paiocb *acb, **pacb; int ret; int result = 0; - int async_context_id = get_async_context_id(); for(;;) { pacb = &s->first_aio; @@ -429,12 +426,6 @@ static int posix_aio_process_queue(void *opaque) if (!acb) return result; - /* we're only interested in requests in the right context */ - if (acb->async_context_id != async_context_id) { - pacb = &acb->next; - continue; - } - ret = qemu_paio_error(acb); if (ret == ECANCELED) { /* remove the request */ @@ -575,7 +566,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; acb->ev_signo = SIGUSR2; - acb->async_context_id = get_async_context_id(); if (qiov) { acb->aio_iov = qiov->iov; @@ -604,7 +594,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, acb->aio_type = QEMU_AIO_IOCTL; acb->aio_fildes = fd; acb->ev_signo = SIGUSR2; - acb->async_context_id = get_async_context_id(); acb->aio_offset = 0; acb->aio_ioctl_buf = buf; acb->aio_ioctl_cmd = req; diff --git a/qemu-common.h b/qemu-common.h index 1e3c66511e..8f21a8cb29 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -115,10 +115,6 @@ int qemu_main(int argc, char **argv, char **envp); /* bottom halves */ typedef void QEMUBHFunc(void *opaque); -void async_context_push(void); -void async_context_pop(void); -int get_async_context_id(void); - QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); void qemu_bh_schedule(QEMUBH *bh); /* Bottom halfs that are scheduled from a bottom half handler are instantly From e680cfa7e20f5049c475ac94f998a79c9997b48d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 13 Jul 2011 15:42:52 +0200 Subject: [PATCH 27/29] coroutines: Use one global bottom half for CoQueue Now that AsyncContexts don't exist any more, we can use one global bottom half for restarting coroutines instead of allocating a new one every time (before removing AsyncContexts, the problem with having a global BH was that it had to belong to a single AsyncContexts and wouldn't be executed in a different one - 
which leads to deadlocks) Signed-off-by: Kevin Wolf --- qemu-coroutine-lock.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index abaa1f7967..a80f437c59 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -30,14 +30,10 @@ static QTAILQ_HEAD(, Coroutine) unlock_bh_queue = QTAILQ_HEAD_INITIALIZER(unlock_bh_queue); - -struct unlock_bh { - QEMUBH *bh; -}; +static QEMUBH* unlock_bh; static void qemu_co_queue_next_bh(void *opaque) { - struct unlock_bh *unlock_bh = opaque; Coroutine *next; trace_qemu_co_queue_next_bh(); @@ -45,14 +41,15 @@ static void qemu_co_queue_next_bh(void *opaque) QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next); qemu_coroutine_enter(next, NULL); } - - qemu_bh_delete(unlock_bh->bh); - qemu_free(unlock_bh); } void qemu_co_queue_init(CoQueue *queue) { QTAILQ_INIT(&queue->entries); + + if (!unlock_bh) { + unlock_bh = qemu_bh_new(qemu_co_queue_next_bh, NULL); + } } void coroutine_fn qemu_co_queue_wait(CoQueue *queue) @@ -65,7 +62,6 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue) bool qemu_co_queue_next(CoQueue *queue) { - struct unlock_bh *unlock_bh; Coroutine *next; next = QTAILQ_FIRST(&queue->entries); @@ -73,10 +69,7 @@ bool qemu_co_queue_next(CoQueue *queue) QTAILQ_REMOVE(&queue->entries, next, co_queue_next); QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next); trace_qemu_co_queue_next(next); - - unlock_bh = qemu_malloc(sizeof(*unlock_bh)); - unlock_bh->bh = qemu_bh_new(qemu_co_queue_next_bh, unlock_bh); - qemu_bh_schedule(unlock_bh->bh); + qemu_bh_schedule(unlock_bh); } return (next != NULL); From ba1d1afdfeeb5ec6cbfd0052ab298d9b7cb7e5ba Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 25 Jul 2011 19:42:37 +0200 Subject: [PATCH 28/29] posix-aio-compat: Allow read after EOF In order to be able to transparently replace bdrv_read calls by bdrv_co_read, reading beyond EOF must produce zeros instead of short reads for AIO, too. 
Signed-off-by: Kevin Wolf --- posix-aio-compat.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 788d113860..8dc00cbb0f 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -198,6 +198,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) return len; } +/* + * Reads/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handled or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf) { ssize_t offset = 0; @@ -334,6 +340,19 @@ static void *aio_thread(void *unused) switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { case QEMU_AIO_READ: + ret = handle_aiocb_rw(aiocb); + if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) { + /* A short read means that we have reached EOF. Pad the buffer + * with zeros for bytes after EOF. */ + QEMUIOVector qiov; + + qemu_iovec_init_external(&qiov, aiocb->aio_iov, + aiocb->aio_niov); + qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret); + + ret = aiocb->aio_nbytes; + } + break; case QEMU_AIO_WRITE: ret = handle_aiocb_rw(aiocb); break; From e7a8a7837a964e0fe327e6ef8dde02c6a53dd14a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Jul 2011 16:05:00 +0200 Subject: [PATCH 29/29] block: Use bdrv_co_* instead of synchronous versions in coroutines If we're already in a coroutine, there is no reason to use the synchronous version of block layer functions when a coroutine one exists. This makes bdrv_read/write/flush use bdrv_co_* when used inside a coroutine.
Signed-off-by: Kevin Wolf --- block.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/block.c b/block.c index 0d05b4b32d..26910ca143 100644 --- a/block.c +++ b/block.c @@ -70,6 +70,7 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); +static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -946,6 +947,17 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, nb_sectors * BDRV_SECTOR_SIZE); } +static inline bool bdrv_has_async_rw(BlockDriver *drv) +{ + return drv->bdrv_co_readv != bdrv_co_readv_em + || drv->bdrv_aio_readv != bdrv_aio_readv_em; +} + +static inline bool bdrv_has_async_flush(BlockDriver *drv) +{ + return drv->bdrv_aio_flush != bdrv_aio_flush_em; +} + /* return < 0 if error. 
See bdrv_write() for the return codes */ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) @@ -954,6 +966,18 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, if (!drv) return -ENOMEDIUM; + + if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) { + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov); + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) return -EIO; @@ -998,8 +1022,21 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BlockDriver *drv = bs->drv; + if (!bs->drv) return -ENOMEDIUM; + + if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) { + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + } + if (bs->read_only) return -EACCES; if (bdrv_check_request(bs, sector_num, nb_sectors)) @@ -1649,6 +1686,10 @@ int bdrv_flush(BlockDriverState *bs) return 0; } + if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) { + return bdrv_co_flush_em(bs); + } + if (bs->drv && bs->drv->bdrv_flush) { return bs->drv->bdrv_flush(bs); } @@ -2916,6 +2957,21 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); } +static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockDriverAIOCB *acb; + + acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + return co.ret; +} + /**************************************************************/ /* removable device support */