/*
* Simple log-structured translation layer for FTLed flash drives
* like memory cards or USB sticks.
*
* (C) 2013 Vitaliy Filippov <vitalif at mail d0t ru>
* Redistributable under the terms of the GNU GPL 3.0+.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/kernel.h> /* printk() */
#include <linux/fs.h> /* everything... */
#include <linux/errno.h> /* error codes */
#include <linux/types.h> /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#define KERNEL_SECTOR_SIZE 512
#define ERROR(fmt, args...) printk(KERN_ERR "sftl: " fmt "\n" , ## args)
#define INFO(fmt, args...) printk(KERN_INFO "sftl: " fmt "\n" , ## args)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaliy Filippov <vitalif@mail.ru>");
MODULE_DESCRIPTION("Log-structured translation layer for USB sticks and memory cards");
MODULE_VERSION("0.1");
static int major_num = 0;
/* A cluster is the mapping unit.
A segment is a sequence of clusters followed by one separate sector holding their mapping metadata. */
const u32 magic = 0x4C544673; /* Magic: sFTL */
const int phy_sz = 512; /* Kernel and physical sector/block size */
const int clust_sz = 4096; /* Cluster size in bytes */
const int clust_blocks = 4096/512; /* Cluster size in blocks (8) */
const int seg_clust = 512/16; /* Segment size in clusters (32) */
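/*
 * On-disk layout (derived from the constants above; a worked example, not
 * additional metadata): each segment occupies
 *   seg_clust*clust_blocks + 1 = 32*8 + 1 = 257 physical sectors,
 * i.e. 32 data clusters of 4096 bytes followed by one 512-byte sector that
 * holds the 32 struct sftl_map entries for those clusters. Segment s thus
 * starts at physical sector s*257, cluster c of that segment at s*257 + c*8,
 * and its map sector at s*257 + 256.
 */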
/* Mapping element */
struct sftl_map {
u32 magic, block, ver, checksum;
};
/* Trivial checksum using some 32-bit prime number */
#define sftl_map_checksum(m) ((u32)((1+(m).block+((m).magic>>16))*(1+(m).ver+((m).magic&0xFFFF))*0xC4489EF5))
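/*
 * Illustration only (mirrors what read_maps() below does when it re-reads
 * the maps at mount time):
 *
 *   struct sftl_map m = { .magic = magic, .block = 123, .ver = 7 };
 *   m.checksum = sftl_map_checksum(m);
 *   // later, an on-disk entry is trusted only if
 *   //   m.magic == magic && m.checksum == sftl_map_checksum(m)
 */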
/* The internal representation of our device */
struct sftl_dev {
// Device parameters
u32 size; // device size in physical blocks
u32 segs; // device size in segments
u32 reserved_segs; // segments reserved for defragmentation during write
u32 *map; // virtual-to-real cluster map
u32 *clust_map; // real-to-virtual cluster map, value=(1+virtual cluster number), 0 means empty
u32 *ver; // cluster versions indexed by their virtual positions
u32 freeclust, freesegs; // free cluster count, free segment count
u32 free_start, free_size; // current free segment sequence, always contains at least @seg_clust-1 segments
// Buffer to hold pending writes - will hold up to a complete segment
char *buf;
u32 buf_max, buf_size;
// Kernel objects
struct gendisk *gd;
struct block_device *blkdev;
struct request_queue *queue;
struct mutex write_mutex;
struct list_head list;
};
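/*
 * Relationship between the three arrays (illustrative example): if virtual
 * cluster V currently lives in physical cluster P = map[V], then
 * clust_map[P] == V+1 (0 would mean "P is empty") and ver[V] is the version
 * number stored in P's map entry; any stale copy of V elsewhere carries a
 * smaller version and is ignored when the maps are rebuilt in read_maps().
 */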
/* Index allocator */
static DEFINE_SPINLOCK(sftl_index_lock);
static DEFINE_IDA(sftl_index_ida);
/* Our block device list, used in cleanup_module */
static LIST_HEAD(sftl_device_list);
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw);
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw);
static void sftl_complete_seg(struct bio *bio, int err)
{
bio_endio((struct bio *)bio->bi_private, err);
bio_put(bio);
}
struct sftl_buf_info
{
struct bio *complete_bio;
void *free_buf;
};
static void sftl_complete_buf(struct bio *bio, int err)
{
struct sftl_buf_info *i = bio->bi_private;
bio_endio(i->complete_bio, err);
bio_put(bio);
kfree(i->free_buf);
kfree(i);
}
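/*
 * Request handling in a nutshell: reads are remapped through map[] and
 * forwarded to the underlying device (unallocated clusters are simply
 * zero-filled); writes are appended to an in-memory segment buffer together
 * with their sftl_map entries, and once the buffer holds a full segment it
 * is written out to the current free segment in one go.
 */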
static void sftl_make_request(struct request_queue *q, struct bio *bio)
{
struct sftl_dev *sftl = (struct sftl_dev*)q->queuedata;
BUG_ON(bio->bi_vcnt > 1);
BUG_ON(bio->bi_sector % clust_blocks);
BUG_ON(bio->bi_size != clust_sz);
if (bio->bi_sector >= sftl->size)
{
INFO("Beyond-end i/o (starting sector = %lu)", (unsigned long)bio->bi_sector);
bio_endio(bio, -EIO);
}
else if (!bio_rw(bio))
{
if (!sftl->ver[bio->bi_sector/clust_blocks])
{
// version=0 => unallocated cluster
zero_fill_bio(bio);
bio_endio(bio, 0);
}
else
{
struct block_device *bdev = sftl->blkdev;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bb = bio_alloc(GFP_KERNEL, 1);
u32 m;
if (!bb)
{
/* bio_alloc() returns NULL on failure, not an ERR_PTR */
bio_endio(bio, -ENOMEM);
return;
}
bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
m = sftl->map[bio->bi_sector/clust_blocks];
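/* Worked example of the address computation below: with the default
geometry (257 sectors per segment, 8 per cluster), m = 70 maps to
segment 70/32 = 2, cluster 70%32 = 6, i.e. sector 2*257 + 6*8 = 562
on the underlying device */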
bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
bb->bi_bdev = bdev;
bb->bi_private = bio;
bb->bi_end_io = sftl_complete_seg;
submit_bio(READ, bb);
/* Success or failure is propagated to the original bio by
sftl_complete_seg(); testing BIO_UPTODATE right after submit_bio()
would race with that completion and could complete @bio twice. */
}
}
else
{
// FIXME Is concurrent writing OK? Do we need any write locks?
struct sftl_map *buf_map = (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + sftl->buf_size;
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
buf_map->magic = magic;
buf_map->block = bio->bi_sector/clust_blocks;
buf_map->ver = sftl->ver[bio->bi_sector/clust_blocks]+1;
buf_map->checksum = sftl_map_checksum(*buf_map);
/* free_start is the segment currently being filled; the original source
mixed the names "nextfreeseg" and "free_start" for this field */
sftl->map[bio->bi_sector/clust_blocks] = sftl->free_start*seg_clust + sftl->buf_size;
sftl->clust_map[sftl->free_start*seg_clust + sftl->buf_size] = 1 + bio->bi_sector/clust_blocks;
sftl->ver[bio->bi_sector/clust_blocks] = buf_map->ver;
sftl->buf_size++;
INFO("Write request (starting sector = %lu, count = %lu)",
(unsigned long)bio->bi_sector, (unsigned long)bio_sectors(bio));
if (sftl->buf_size >= sftl->buf_max)
{
// Need to flush current buffer before completing this bio
void *buf = sftl->buf;
struct sftl_buf_info *info = kmalloc(sizeof(struct sftl_buf_info), GFP_KERNEL);
int err;
info->free_buf = buf;
info->complete_bio = bio;
// Simply switch to a fresh buffer (only a finite number of them can be in flight)
// kzalloc rather than kmalloc: unused map entries at the tail of a partially
// filled buffer must stay zeroed so they never look like valid mappings
sftl->buf = kzalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
sftl->buf_size = 0;
err = bio_submit_kern_seq(sftl->blkdev, buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
sftl->free_start*(seg_clust*clust_blocks+1), info, sftl_complete_buf, WRITE);
if (err)
{
bio_endio(bio, -EIO);
kfree(buf);
kfree(info);
}
// FIXME Correctly adjust free segment address
sftl->freeclust -= seg_clust;
sftl->freesegs--;
/* The draft below was meant to find @seg_clust free clusters to form the
next free segment and, if they are spread over several segments, to
relocate their live clusters (defragment free space). It relies on
helpers that were never written (free_clusters_count(), the @read/@write
placeholders) and ends with an open question, originally in Russian:
"hey, and where do we write to?". It is left disabled so the module
still builds. */
#if 0
u32 defrag[seg_clust];
u32 prev = sftl->free_start;
u32 virt;
int i, j, n = 0, m, n_defrag = 0;
while (n < seg_clust)
{
i = prev;
do
{
i = (i+1) % sftl->segs;
} while (i != prev && !(m = free_clusters_count(i)));
BUG_ON(i == prev); // infinite loop prevented
n += m;
defrag[n_defrag++] = prev = i;
}
if (n_defrag > 1)
{
// Next free segment is fragmented
for (i = 0; i < n_defrag; i++)
{
for (j = 0; j < seg_clust; j++)
{
if ((virt = sftl->clust_map[defrag[i]*seg_clust+j]))
{
@read(sftl, virt, buf);
@write(sftl, virt, buf); /* ...and where do we write to? */
}
}
}
}
sftl->free_start = defrag[0];
#else
/* Placeholder (an assumption, not the intended garbage collection):
simply advance to the next segment of the free run found at mount time. */
sftl->free_start = (sftl->free_start + 1) % sftl->segs;
if (sftl->free_size)
sftl->free_size--;
#endif
}
else
bio_endio(bio, 0);
}
}
/*
* The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
* calls this. We need to implement getgeo, since we can't
* use tools such as fdisk to partition the drive otherwise.
*/
static int sftl_getgeo(struct block_device * block_device, struct hd_geometry * geo)
{
geo->cylinders = (get_capacity(block_device->bd_disk) & ~0x3f) >> 6;
geo->heads = 4;
geo->sectors = 16;
geo->start = 0;
return 0;
}
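/*
 * With 4 heads and 16 sectors per track, one cylinder is 64 sectors, which
 * is why the cylinder count above is the capacity divided by 64
 * ("& ~0x3f" drops the partial cylinder, ">> 6" divides by 64).
 */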
/*
* The device operations structure.
*/
static struct block_device_operations sftl_ops = {
.owner = THIS_MODULE,
.getgeo = sftl_getgeo
};
static int sftl_reg_major(void)
{
if (!major_num)
{
/* Register major number (0 = allocate one dynamically) */
int ret = register_blkdev(0, "sftl");
if (ret < 0)
{
printk(KERN_WARNING "sftl: unable to get major number\n");
return ret;
}
major_num = ret;
}
return 0;
}
static int __init sftl_init(void)
{
return sftl_reg_major();
}
static void sftl_free_device(struct sftl_dev *dev)
{
if (!dev)
return;
if (dev->buf_size)
{
INFO("Flushing %d pending clusters", dev->buf_size);
sync_io(dev->blkdev, dev->free_start*(seg_clust*clust_blocks+1), dev->buf, seg_clust*clust_sz+phy_sz, WRITE);
dev->buf_size = 0;
// Don't care about adjusting free_start because we're freeing the device
}
if (dev->gd)
{
del_gendisk(dev->gd);
put_disk(dev->gd);
blk_cleanup_queue(dev->queue);
}
if (dev->blkdev)
{
invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1);
blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
if (dev->map)
vfree(dev->map);
if (dev->clust_map)
vfree(dev->clust_map);
if (dev->ver)
vfree(dev->ver);
if (dev->buf)
kfree(dev->buf);
kfree(dev);
}
static void __exit sftl_exit(void)
{
struct list_head *pos, *next;
/* Remove all devices */
list_for_each_safe(pos, next, &sftl_device_list)
{
struct sftl_dev *dev = list_entry(pos, typeof(*dev), list);
INFO("%s: removing", dev->gd->disk_name);
/* Unlink and sync before sftl_free_device() - it puts the underlying
block device and kfree()s @dev, so neither may be touched afterwards */
list_del(&dev->list);
sync_blockdev(dev->blkdev);
sftl_free_device(dev);
}
unregister_blkdev(major_num, "sftl");
}
module_init(sftl_init);
module_exit(sftl_exit);
static void bio_map_kern_endio(struct bio *bio, int err)
{
bio_put(bio);
}
// Copy-pasted from fs/bio.c
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
int offset, i;
struct bio *bio;
bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
offset) < bytes)
break;
data += bytes;
len -= bytes;
offset = 0;
}
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
struct bio_seq
{
atomic_t count;
void *private;
bio_end_io_t *endio;
int err;
};
static void bio_map_kern_seq_endio(struct bio *bio, int err)
{
struct bio_seq *seq = bio->bi_private;
if (err)
{
INFO("I/O error %d", err);
seq->err = err;
}
if (atomic_dec_and_test(&seq->count))
{
bio->bi_private = seq->private;
seq->endio(bio, seq->err);
kfree(seq);
}
else
bio_put(bio);
}
/**
* Generates and submits a sequence of one or more BIOs. @endio
* is called only after ALL of them have finished.
*
* @bdev: block device
* @data: data buffer
* @len: total length, can exceed @bdev queue limit
* @gfp_mask: mask to use when allocating memory
* @sector: starting sector to write at
* @private: @endio will see this value at bio->bi_private when called
* @endio: normal bio endio callback
* @rw: READ or WRITE
*/
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio))
{
return PTR_ERR(bio);
}
bio->bi_sector = sector;
bio->bi_bdev = bdev;
if (bio->bi_size < len)
{
struct bio_seq *seq = kmalloc(sizeof(struct bio_seq), gfp_mask);
int n = 1;
if (!seq)
{
bio_put(bio);
return -ENOMEM;
}
bio->bi_private = NULL;
bio->bi_end_io = bio_map_kern_seq_endio;
seq->err = 0;
seq->endio = endio;
seq->private = private;
data += bio->bi_size;
len -= bio->bi_size;
sector += bio->bi_size >> 9;
while (len > 0)
{
struct bio *bio2 = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio2))
{
// Free previously allocated bio's
kfree(seq);
while (bio)
{
struct bio *t = bio->bi_private;
bio_put(bio);
bio = t;
}
return PTR_ERR(bio2);
}
bio2->bi_bdev = bdev;
bio2->bi_sector = sector;
bio2->bi_private = bio;
bio2->bi_end_io = bio_map_kern_seq_endio;
data += bio2->bi_size;
len -= bio2->bi_size;
sector += bio2->bi_size >> 9;
bio = bio2;
n++;
}
atomic_set(&seq->count, n);
while (bio)
{
struct bio *t = bio->bi_private;
bio->bi_private = seq;
submit_bio(rw, bio);
bio = t;
}
}
else
{
bio->bi_private = private;
bio->bi_end_io = endio;
submit_bio(rw, bio);
}
return 0;
}
static void endFunc_tryKM2(struct bio *bb, int err)
{
if (bb->bi_private)
{
complete((struct completion*)(bb->bi_private));
}
bio_put(bb);
}
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw)
{
DECLARE_COMPLETION_ONSTACK(waithandle);
int err = bio_submit_kern_seq(bdev, buf, len, GFP_KERNEL, sector, &waithandle, endFunc_tryKM2, rw);
if (err)
{
INFO("I/O error %d", err);
return;
}
wait_for_completion(&waithandle);
}
static void read_maps(struct sftl_dev *dev)
{
struct sftl_map *buf = kmalloc(phy_sz, GFP_KERNEL);
int i, j, seg;
u32 max_first = 0, max_free = 0;
u32 cur_first = 0, cur_free = 0;
u32 dev_clusters = dev->size/clust_blocks;
INFO("reading translation maps");
for (i = 0; i < dev->segs; i++)
{
sync_io(dev->blkdev, (i+1)*(seg_clust*clust_blocks+1) - 1, buf, phy_sz, READ);
for (seg = 1, j = 0; j < seg_clust; j++)
{
if (buf[j].magic == magic && buf[j].checksum == sftl_map_checksum(buf[j]) &&
buf[j].block < dev_clusters &&
dev->ver[buf[j].block] < buf[j].ver)
{
dev->map[buf[j].block] = i*seg_clust+j;
dev->ver[buf[j].block] = buf[j].ver;
/* clust_map stores 1 + virtual cluster number (0 means "empty"),
matching what the write path records */
dev->clust_map[i*seg_clust+j] = 1 + buf[j].block;
seg = 0;
}
else
{
dev->freeclust++;
}
}
if (seg)
{
// Segment is free
dev->freesegs++;
if (!cur_free)
{
cur_first = i;
cur_free = 1;
}
else
{
cur_free++;
}
}
else if (cur_free)
{
// Free segment sequence ended
// Remember max free sequence - writes will go there
if (cur_free > max_free)
{
max_first = cur_first;
max_free = cur_free;
}
cur_free = 0;
}
}
if (dev->freesegs < dev->reserved_segs)
{
// It's impossible to really occupy all clusters
BUG_ON(dev->freeclust < dev->reserved_segs * seg_clust);
INFO("All free space is fragmented on the device");
// FIXME: Need to defragment free space on the device...
}
// We'll start writing into a free segment
dev->free_start = (cur_free > max_free ? cur_first : max_first);
dev->free_size = (cur_free > max_free ? cur_free : max_free);
INFO("Next free segment = %d", dev->nextfreeseg);
kfree(buf);
}
static struct sftl_dev *add_device(char *devname)
{
const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
struct block_device *bdev;
struct sftl_dev *dev;
int error, index;
uint64_t t;
uint64_t allocated_memory = 0;
if (!major_num && sftl_reg_major() < 0)
return NULL;
if (!devname)
return NULL;
dev = kzalloc(sizeof(struct sftl_dev), GFP_KERNEL);
if (!dev)
return NULL;
/* Get a handle on the underlying device */
bdev = blkdev_get_by_path(devname, mode, dev);
if (IS_ERR(bdev))
{
ERROR("cannot open device %s", devname);
goto devinit_err;
}
dev->blkdev = bdev;
/* if (MAJOR(bdev->bd_dev) == major_num)
{
ERROR("attempting to translate an SFTL device");
goto devinit_err;
}*/
mutex_init(&dev->write_mutex);
t = get_capacity(dev->blkdev->bd_disk);
do_div(t, seg_clust*clust_blocks + 1);
dev->segs = t;
dev->reserved_segs = seg_clust;
if (dev->segs <= dev->reserved_segs)
{
ERROR("device %s is too small (%lu blocks); size should be no less than %lu blocks",
devname, (unsigned long)get_capacity(dev->blkdev->bd_disk), (unsigned long)((dev->reserved_segs+1) * (seg_clust*clust_blocks + 1)));
goto devinit_err;
}
dev->size = (dev->segs-dev->reserved_segs) * seg_clust * clust_blocks;
dev->map = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->ver = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->clust_map = vmalloc(sizeof(u32) * dev->segs * seg_clust);
dev->buf = kzalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
if (!dev->map || !dev->ver || !dev->clust_map || !dev->buf)
{
ERROR("cannot allocate translation maps for %s", devname);
goto devinit_err;
}
memset(dev->ver, 0, sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
memset(dev->clust_map, 0, sizeof(u32) * dev->segs * seg_clust);
allocated_memory = sizeof(u32) * seg_clust * (dev->segs*3 - dev->reserved_segs*2);
dev->buf_max = seg_clust;
/* Get a request queue */
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (!dev->queue)
goto devinit_err;
blk_queue_make_request(dev->queue, sftl_make_request);
dev->queue->queuedata = dev;
/* FIXME: It's OK when PAGE_SIZE==clust_sz==4096
but we should ALWAYS support bio of size==PAGE_SIZE */
blk_queue_max_hw_sectors(dev->queue, clust_blocks);
blk_queue_logical_block_size(dev->queue, clust_sz);
/* Allocate index for the new disk */
do {
if (!ida_pre_get(&sftl_index_ida, GFP_KERNEL))
goto devinit_err;
spin_lock(&sftl_index_lock);
error = ida_get_new(&sftl_index_ida, &index);
spin_unlock(&sftl_index_lock);
} while (error == -EAGAIN);
if (error)
goto devinit_err;
/* Initialise gendisk structure */
dev->gd = alloc_disk(16);
if (!dev->gd)
goto devinit_err;
dev->gd->major = major_num;
dev->gd->first_minor = index*16;
dev->gd->fops = &sftl_ops;
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, 32, "sftl%d", index);
set_capacity(dev->gd, dev->size);
dev->gd->queue = dev->queue;
/* Read maps from the device */
read_maps(dev);
/* Add disk */
add_disk(dev->gd);
list_add(&dev->list, &sftl_device_list);
INFO("%s: translating %s; %d sectors; %d free; used %d kb RAM for mappings", dev->gd->disk_name,
devname, dev->size, (dev->freeclust - dev->reserved_segs*seg_clust)*clust_blocks, (int)(allocated_memory >> 10));
return dev;
devinit_err:
sftl_free_device(dev);
return NULL;
}
static inline void kill_final_newline(char *str)
{
char *newline = strrchr(str, '\n');
if (newline && !newline[1])
*newline = 0;
}
static int sftl_setup(const char *val, struct kernel_param *kp)
{
char buf[80 + 12];
char *str = buf;
char *token[2];
char *name;
int i;
if (strnlen(val, sizeof(buf)) >= sizeof(buf))
{
ERROR("parameter too long");
return 0;
}
strcpy(str, val);
kill_final_newline(str);
for (i = 0; i < 1; i++)
token[i] = strsep(&str, ",");
if (str)
{
ERROR("too much parameters");
return 0;
}
if (!token[0])
{
ERROR("underlying block device name missing");
return 0;
}
name = token[0];
if (strlen(name) + 1 > 80)
{
ERROR("device name too long");
return 0;
}
add_device(name);
return 0;
}
module_param_call(dev, sftl_setup, NULL, NULL, 0200);
MODULE_PARM_DESC(dev, "Block device to translate. \"dev=<dev>\"");
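/*
 * Example usage (sketch; the device path is illustrative):
 *   # insmod sftl.ko dev=/dev/sdb
 * or, after the module is loaded, via the writable module parameter:
 *   # echo /dev/sdb > /sys/module/sftl/parameters/dev
 * The translated device then appears as /dev/sftl0 (sftl1, ... for further
 * devices).
 */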