diff --git a/qemu-img.c b/qemu-img.c index 9dddfbefce..8d30e43b53 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1305,20 +1305,312 @@ out3: return ret; } +enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, + BLK_BACKING_FILE, +}; + +typedef struct ImgConvertState { + BlockBackend **src; + int64_t *src_sectors; + int src_cur, src_num; + int64_t src_cur_offset; + int64_t total_sectors; + int64_t allocated_sectors; + enum ImgConvertBlockStatus status; + int64_t sector_next_status; + BlockBackend *target; + bool has_zero_init; + bool compressed; + bool target_has_backing; + int min_sparse; + size_t cluster_sectors; + size_t buf_sectors; +} ImgConvertState; + +static void convert_select_part(ImgConvertState *s, int64_t sector_num) +{ + assert(sector_num >= s->src_cur_offset); + while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { + s->src_cur_offset += s->src_sectors[s->src_cur]; + s->src_cur++; + assert(s->src_cur < s->src_num); + } +} + +static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) +{ + int64_t ret; + int n; + + convert_select_part(s, sector_num); + + assert(s->total_sectors > sector_num); + n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + + if (s->sector_next_status <= sector_num) { + ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), + sector_num - s->src_cur_offset, + n, &n); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ZERO) { + s->status = BLK_ZERO; + } else if (ret & BDRV_BLOCK_DATA) { + s->status = BLK_DATA; + } else if (!s->target_has_backing) { + /* Without a target backing file we must copy over the contents of + * the backing file as well. */ + /* TODO Check block status of the backing file chain to avoid + * needlessly reading zeroes and limiting the iteration to the + * buffer size */ + s->status = BLK_DATA; + } else { + s->status = BLK_BACKING_FILE; + } + + s->sector_next_status = sector_num + n; + } + + n = MIN(n, s->sector_next_status - sector_num); + if (s->status == BLK_DATA) { + n = MIN(n, s->buf_sectors); + } + + /* We need to write complete clusters for compressed images, so if an + * unallocated area is shorter than that, we must consider the whole + * cluster allocated. */ + if (s->compressed) { + if (n < s->cluster_sectors) { + n = MIN(s->cluster_sectors, s->total_sectors - sector_num); + s->status = BLK_DATA; + } else { + n = QEMU_ALIGN_DOWN(n, s->cluster_sectors); + } + } + + return n; +} + +static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, + uint8_t *buf) +{ + int n; + int ret; + + if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) { + return 0; + } + + assert(nb_sectors <= s->buf_sectors); + while (nb_sectors > 0) { + BlockBackend *blk; + int64_t bs_sectors; + + /* In the case of compression with multiple source files, we can get a + * nb_sectors that spreads into the next part. So we must be able to + * read across multiple BDSes for one convert_read() call. */ + convert_select_part(s, sector_num); + blk = s->src[s->src_cur]; + bs_sectors = s->src_sectors[s->src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); + ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n); + if (ret < 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, + const uint8_t *buf) +{ + int ret; + + while (nb_sectors > 0) { + int n = nb_sectors; + + switch (s->status) { + case BLK_BACKING_FILE: + /* If we have a backing file, leave clusters unallocated that are + * unallocated in the source image, so that the backing file is + * visible at the respective offset. */ + assert(s->target_has_backing); + break; + + case BLK_DATA: + /* We must always write compressed clusters as a whole, so don't + * try to find zeroed parts in the buffer. We can only save the + * write if the buffer is completely zeroed and we're allowed to + * keep the target sparse. */ + if (s->compressed) { + if (s->has_zero_init && s->min_sparse && + buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) + { + assert(!s->target_has_backing); + break; + } + + ret = blk_write_compressed(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + + /* If there is real non-zero data or we're told to keep the target + * fully allocated (-S 0), we must write it. Otherwise we can treat + * it as zero sectors. */ + if (!s->min_sparse || + is_allocated_sectors_min(buf, n, &n, s->min_sparse)) + { + ret = blk_write(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + /* fall-through */ + + case BLK_ZERO: + if (s->has_zero_init) { + break; + } + ret = blk_write_zeroes(s->target, sector_num, n, 0); + if (ret < 0) { + return ret; + } + break; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_do_copy(ImgConvertState *s) +{ + uint8_t *buf = NULL; + int64_t sector_num, allocated_done; + int ret; + int n; + + /* Check whether we have zero initialisation or can get it efficiently */ + s->has_zero_init = s->min_sparse && !s->target_has_backing + ? bdrv_has_zero_init(blk_bs(s->target)) + : false; + + if (!s->has_zero_init && !s->target_has_backing && + bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) + { + ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP); + if (ret == 0) { + s->has_zero_init = true; + } + } + + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. */ + if (s->compressed) { + if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { + error_report("invalid cluster size"); + ret = -EINVAL; + goto fail; + } + s->buf_sectors = s->cluster_sectors; + } + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + /* Calculate allocated sectors for progress */ + s->allocated_sectors = 0; + sector_num = 0; + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + s->allocated_sectors += n; + } + sector_num += n; + } + + /* Do the copy */ + s->src_cur = 0; + s->src_cur_offset = 0; + s->sector_next_status = 0; + + sector_num = 0; + allocated_done = 0; + + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + allocated_done += n; + qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, + 0); + } + + ret = convert_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + ret = convert_write(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + sector_num += n; + } + + if (s->compressed) { + /* signal EOF to align */ + ret = blk_write_compressed(s->target, 0, NULL, 0); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + qemu_vfree(buf); + return ret; +} + static int img_convert(int argc, char **argv) { - int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create; + int c, bs_n, bs_i, compress, cluster_sectors, skip_create; int64_t ret = 0; int progress = 0, flags, src_flags; const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename; BlockDriver *drv, *proto_drv; BlockBackend **blk = NULL, *out_blk = NULL; BlockDriverState **bs = NULL, *out_bs = NULL; - int64_t total_sectors, nb_sectors, sector_num, bs_offset; + int64_t total_sectors; int64_t *bs_sectors = NULL; - uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; - const uint8_t *buf1; BlockDriverInfo bdi; QemuOpts *opts = NULL; QemuOptsList *create_opts = NULL; @@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv) bool quiet = false; Error *local_err = NULL; QemuOpts *sn_opts = NULL; + ImgConvertState state; fmt = NULL; out_fmt = "raw"; @@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv) } out_bs = blk_bs(out_blk); - bs_i = 0; - bs_offset = 0; - /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) * as maximum. */ @@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv) out_bs->bl.discard_alignment)) ); - buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE); - if (skip_create) { int64_t output_sectors = blk_nb_sectors(out_blk); if (output_sectors < 0) { @@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv) cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE; } - if (compress) { - if (cluster_sectors <= 0 || cluster_sectors > bufsectors) { - error_report("invalid cluster size"); - ret = -1; - goto out; - } - sector_num = 0; + state = (ImgConvertState) { + .src = blk, + .src_sectors = bs_sectors, + .src_num = bs_n, + .total_sectors = total_sectors, + .target = out_blk, + .compressed = compress, + .target_has_backing = (bool) out_baseimg, + .min_sparse = min_sparse, + .cluster_sectors = cluster_sectors, + .buf_sectors = bufsectors, + }; + ret = convert_do_copy(&state); - nb_sectors = total_sectors; - - for(;;) { - int64_t bs_num; - int remainder; - uint8_t *buf2; - - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) - break; - if (nb_sectors >= cluster_sectors) - n = cluster_sectors; - else - n = nb_sectors; - - bs_num = sector_num - bs_offset; - assert (bs_num >= 0); - remainder = n; - buf2 = buf; - while (remainder > 0) { - int nlow; - while (bs_num == bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i++; - assert (bs_i < bs_n); - bs_num = 0; - /* printf("changing part: sector_num=%" PRId64 ", " - "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - assert (bs_num < bs_sectors[bs_i]); - - nlow = remainder > bs_sectors[bs_i] - bs_num - ? bs_sectors[bs_i] - bs_num : remainder; - - ret = blk_read(blk[bs_i], bs_num, buf2, nlow); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - bs_num, strerror(-ret)); - goto out; - } - - buf2 += nlow * 512; - bs_num += nlow; - - remainder -= nlow; - } - assert (remainder == 0); - - if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) { - ret = blk_write_compressed(out_blk, sector_num, buf, n); - if (ret != 0) { - error_report("error while compressing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n; - qemu_progress_print(100.0 * sector_num / total_sectors, 0); - } - /* signal EOF to align */ - blk_write_compressed(out_blk, 0, NULL, 0); - } else { - int64_t sectors_to_read, sectors_read, sector_num_next_status; - bool count_allocated_sectors; - int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; - - if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) { - ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); - if (ret < 0) { - goto out; - } - has_zero_init = 1; - } - - sectors_to_read = total_sectors; - count_allocated_sectors = progress && (out_baseimg || has_zero_init); -restart: - sector_num = 0; // total number of sectors converted so far - sectors_read = 0; - sector_num_next_status = 0; - - for(;;) { - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) { - if (count_allocated_sectors) { - sectors_to_read = sectors_read; - count_allocated_sectors = false; - goto restart; - } - ret = 0; - break; - } - - while (sector_num - bs_offset >= bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i ++; - assert (bs_i < bs_n); - /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " - "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - - if ((out_baseimg || has_zero_init) && - sector_num >= sector_num_next_status) { - n = nb_sectors > INT_MAX ? INT_MAX : nb_sectors; - ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset, - n, &n1); - if (ret < 0) { - error_report("error while reading block status of sector %" - PRId64 ": %s", sector_num - bs_offset, - strerror(-ret)); - goto out; - } - /* If the output image is zero initialized, we are not working - * on a shared base and the input is zero we can skip the next - * n1 sectors */ - if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) { - sector_num += n1; - continue; - } - /* If the output image is being created as a copy on write - * image, assume that sectors which are unallocated in the - * input image are present in both the output's and input's - * base images (no need to copy them). */ - if (out_baseimg) { - if (!(ret & BDRV_BLOCK_DATA)) { - sector_num += n1; - continue; - } - /* The next 'n1' sectors are allocated in the input image. - * Copy only those as they may be followed by unallocated - * sectors. */ - nb_sectors = n1; - } - /* avoid redundant callouts to get_block_status */ - sector_num_next_status = sector_num + n1; - } - - n = MIN(nb_sectors, bufsectors); - - /* round down request length to an aligned sector, but - * do not bother doing this on short requests. They happen - * when we found an all-zero area, and the next sector to - * write will not be sector_num + n. */ - if (cluster_sectors > 0 && n >= cluster_sectors) { - int64_t next_aligned_sector = (sector_num + n); - next_aligned_sector -= next_aligned_sector % cluster_sectors; - if (sector_num + n > next_aligned_sector) { - n = next_aligned_sector - sector_num; - } - } - - n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); - - sectors_read += n; - if (count_allocated_sectors) { - sector_num += n; - continue; - } - - n1 = n; - ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - sector_num - bs_offset, strerror(-ret)); - goto out; - } - /* NOTE: at the same time we convert, we do not write zero - sectors to have a chance to compress the image. Ideally, we - should add a specific call to have the info to go faster */ - buf1 = buf; - while (n > 0) { - if (!has_zero_init || - is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { - ret = blk_write(out_blk, sector_num, buf1, n1); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n1; - n -= n1; - buf1 += n1 * 512; - } - qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0); - } - } out: if (!ret) { qemu_progress_print(100, 0); @@ -1870,7 +1975,6 @@ out: qemu_progress_end(); qemu_opts_del(opts); qemu_opts_free(create_opts); - qemu_vfree(buf); qemu_opts_del(sn_opts); blk_unref(out_blk); g_free(bs);