sftl/sftl.c

/*
* Simple log-structured translation layer for FTLed flash drives
* like memory cards or USB sticks.
*
* (C) 2013 Vitaliy Filippov <vitalif at mail d0t ru>
* Redistributable under the terms of the GNU GPL 3.0+.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/kernel.h> /* printk() */
#include <linux/fs.h> /* everything... */
#include <linux/errno.h> /* error codes */
#include <linux/types.h> /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/spinlock.h>
#define KERNEL_SECTOR_SIZE 512
#define ERROR(fmt, args...) printk(KERN_ERR "sftl: " fmt "\n" , ## args)
#define INFO(fmt, args...) printk(KERN_INFO "sftl: " fmt "\n" , ## args)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaliy Filippov <vitalif@mail.ru>");
MODULE_DESCRIPTION("Log-structured translation layer for USB sticks and memory cards");
MODULE_VERSION("0.1");
static int major_num = 0;
/* A cluster is the mapping unit.
A segment is a sequence of clusters plus one separate sector holding their mapping metadata */
const u8 magic[] = "FtL";
const int phy_sz = 512; /* Kernel and physical sector/block size */
const int clust_sz = 4096; /* Cluster size in bytes */
const int clust_blocks = 4096/512; /* Cluster size in blocks (8) */
const int seg_clust = 512/16; /* Segment size in clusters (32) */
/* Mapping element */
struct __attribute__((__packed__)) sftl_map {
u8 magic[3];
u8 is_erased;
u32 block, ver, checksum;
};
/* Trivial checksum using some 32-bit prime number */
#define sftl_map_checksum(m) ((u32)((1+(m).block+((m).magic[0] | (m).magic[1]<<8))*(1+(m).ver+((m).magic[2])*((m).is_erased ? 227 : 1))*0xC4489EF5))
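
/* On-disk layout implied by the constants above (an illustrative summary derived
from this file, not from any external spec): each segment occupies
seg_clust*clust_blocks + 1 = 257 physical 512-byte sectors, i.e. 32 data clusters
of 4096 bytes followed by one sector packed with 32 struct sftl_map entries
(16 bytes each). Physical cluster m therefore starts at sector
m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks
and the map sector of segment i is sector
(i+1)*(seg_clust*clust_blocks+1) - 1,
which is exactly the arithmetic used inline in sftl_make_request() and read_maps(). */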
/* The internal representation of our device */
struct sftl_dev {
// Device parameters
u32 size; // device size in physical blocks
u32 segs; // device size in segments
u32 reserved_segs; // segments reserved for defragmentation during write
u32 *map; // virtual-to-real cluster map
u32 *clust_map; // real-to-virtual cluster map, value=(1+virtual cluster number), 0 means empty
u32 *ver; // cluster versions indexed by their virtual positions
u32 freeclust, freesegs; // free cluster count, free segment count
u32 free_start_seg; // starting segment of free segment sequence
u32 free_end_seg; // ending segment (end-start always >= @seg_clust-1 segments)
// Buffer to hold pending writes - will hold up to a complete segment starting at @free_start_seg
char *buf;
u32 buf_max, buf_size;
// Kernel objects
rwlock_t buffer_lock;
wait_queue_head_t flush_event;
struct gendisk *gd;
struct block_device *blkdev;
struct request_queue *queue;
struct list_head list;
};
/* Index allocator */
static DEFINE_SPINLOCK(sftl_index_lock);
static DEFINE_IDA(sftl_index_ida);
/* Our block device list, used in cleanup_module */
static LIST_HEAD(sftl_device_list);
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw);
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw);
static void sftl_complete_seg(struct bio *bio, int err)
{
bio_endio((struct bio *)bio->bi_private, err);
bio_put(bio);
}
struct sftl_buf_info
{
struct bio *complete_bio;
void *free_buf;
};
static void sftl_complete_buf(struct bio *bio, int err)
{
struct sftl_buf_info *i = bio->bi_private;
bio_endio(i->complete_bio, err);
bio_put(bio);
kfree(i->free_buf);
kfree(i);
}
static void sftl_make_request(struct request_queue *q, struct bio *bio)
{
struct sftl_dev *sftl = (struct sftl_dev*)q->queuedata;
u32 cluster = bio->bi_sector/clust_blocks;
BUG_ON(bio->bi_vcnt > 1);
BUG_ON(bio->bi_sector % clust_blocks);
BUG_ON(bio->bi_size != clust_sz);
if (bio->bi_sector >= sftl->size)
{
INFO("Beyond-end i/o (starting sector = %lu)", (unsigned long)bio->bi_sector);
bio_endio(bio, -EIO);
}
else if (!bio_rw(bio))
{
if (!sftl->ver[cluster])
{
// version=0 => unallocated cluster
zero_fill_bio(bio);
bio_endio(bio, 0);
}
else if (sftl->buf_size && sftl->map[cluster] >= sftl->free_start_seg*seg_clust
&& sftl->map[cluster] < sftl->free_start_seg*seg_clust + sftl->buf_size)
{
// written but not yet flushed cluster
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(buffer, sftl->buf + clust_sz*(sftl->map[cluster] - sftl->free_start_seg*seg_clust), clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
bio_endio(bio, 0);
}
else
{
// cluster needs to be read from disk
u32 m = sftl->map[cluster];
struct block_device *bdev = sftl->blkdev;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bb = bio_alloc(GFP_KERNEL, 1);
if (!bb)
{
bio_endio(bio, -ENOMEM);
return;
}
bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
bb->bi_bdev = bdev;
bb->bi_private = bio;
bb->bi_end_io = sftl_complete_seg;
submit_bio(READ, bb);
if (!(bb->bi_flags & (1 << BIO_UPTODATE)))
{
bio_put(bb);
bio_endio(bio, -EIO);
}
}
}
else
{
// R/W locking using one R/W spinlock and one event.
//
// Reading:
// * Take read lock
// * Check if requested cluster is mapped into buffer
// * If yes:
// ** Read from the buffer
// * If no:
// ** Initiate block read operation
// * Unlock
//
// Writing:
// (Start):
// * Take write lock
// * Check for free space in buffer
// * If sufficient:
// ** Write current bio into buffer
// ** Modify translation maps
// * If insufficient:
// ** Check flush flag (no need for atomics etc. as we are already inside the buffer lock)
// ** If someone is already flushing:
// *** Unlock
// *** Wait until flushing ends using an event
// *** Goto (Start)
// ** If no one is flushing yet:
// *** Set flush flag
// *** Remember current bio and initiate (Flush) operation
// * Unlock
//
// After (Flush) operation ends:
// * Take write lock (writers are already blocked, this is to block readers)
// * Clear buffer
// * If the free sequence pointer can be moved without cleaning:
// ** Move pointer
// ** Perform own remembered write operation
// ** Unset flush flag
// ** Unlock
// ** Wake up waiting writers
// * If not:
// ** Initiate cleaning process
// ** Unlock
//
// After cleaning operation ends:
// * Take write lock
// * Modify translation maps
// * Move free sequence pointer
// * If there are no more pending cleaning operations:
// ** Perform own remembered write operation:
// *** Write current bio into buffer
// *** Modify translation maps
// ** Unset flush flag
// ** Unlock
// ** Wake up waiting writers
// * Else:
// ** Initiate next cleaning operation
// ** Unlock
// (See the illustrative sketch below.)
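//
// Below is a minimal illustrative sketch of the writer side of that protocol,
// kept as a comment on purpose: the code that follows does not take
// buffer_lock yet, and the "flushing" flag is a hypothetical field that
// struct sftl_dev does not currently have.
//
// retry:
// write_lock(&sftl->buffer_lock);
// if (sftl->buf_size < sftl->buf_max)
// {
// /* copy bio data into sftl->buf, update map/clust_map/ver, buf_size++ */
// write_unlock(&sftl->buffer_lock);
// bio_endio(bio, 0);
// }
// else if (sftl->flushing)
// {
// write_unlock(&sftl->buffer_lock);
// wait_event(sftl->flush_event, !sftl->flushing);
// goto retry;
// }
// else
// {
// sftl->flushing = 1;
// /* remember this bio and start the (Flush) operation */
// write_unlock(&sftl->buffer_lock);
// }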
struct sftl_map *buf_map = (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + sftl->buf_size;
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
buf_map->magic[0] = magic[0];
buf_map->magic[1] = magic[1];
buf_map->magic[2] = magic[2];
buf_map->is_erased = 0;
buf_map->block = cluster;
buf_map->ver = sftl->ver[cluster]+1;
buf_map->checksum = sftl_map_checksum(*buf_map);
sftl->map[cluster] = sftl->free_start_seg*seg_clust + sftl->buf_size;
sftl->clust_map[sftl->map[cluster]] = 1 + cluster;
sftl->ver[cluster] = buf_map->ver;
sftl->buf_size++;
INFO("Write request (starting sector = %lu, count = %lu)",
(unsigned long)bio->bi_sector, (unsigned long)bio_sectors(bio));
if (sftl->buf_size >= sftl->buf_max)
{
// Need to flush current buffer before completing this bio
void *buf = sftl->buf;
struct sftl_buf_info *info = kmalloc(sizeof(struct sftl_buf_info), GFP_KERNEL);
int err;
info->free_buf = buf;
info->complete_bio = bio;
// Just stupidly switch buffer (there will be no overflow)
sftl->buf = kmalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
sftl->buf_size = 0;
err = bio_submit_kern_seq(sftl->blkdev, buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_complete_buf, WRITE);
if (err)
{
// Submission failed: fail this bio and restore the previous buffer
bio_endio(bio, -EIO);
kfree(sftl->buf);
sftl->buf = buf;
kfree(info);
}
else
{
// The segment is in flight: account for it and advance the free pointer
sftl->freeclust -= seg_clust;
sftl->freesegs--;
// FIXME Correctly adjust free segment address
sftl->free_start_seg++;
}
/*
Algorithm:
1) If less than reserved clusters are free on the device
=> This shouldn't happen. Abort writing.
2) If a "next free sequence" is already remembered, and there are
no free segments left in current free sequence
=> Switch free sequence to "next", write as usual
3) If more than N-1 free segments are left in current sequence,
or if a "next free sequence" is already remembered
=> Write as usual
4) Try to find a free sequence of N segments. If there is one
=> Remember it as a "next free sequence", write as usual
5) Try to find a freeable sequence of N segments. If there is one
=> Free it using current N-1 free segments, make it current
and write as usual
6) If there is no complete freeable sequence found
=> Move data from a segment adjacent to current free sequence
to random free clusters on the device.
This operation ensures that reserved segments are never fragmented.
It may fail if nearly ALL clusters are occupied on the device.
This is OK because we know that we'll definitely have at least N
free clusters on the device after writing any of the reserved segments.
*/
/*
BUG_ON(dev->freeclust < dev->reserved_segs*seg_clust);
if (sftl->next_free_end)
{
if (sftl->free_end_seg <= sftl->free_start_seg)
{
sftl->free_start_seg = sftl->next_free_start;
sftl->free_end_seg = sftl->next_free_end;
sftl->next_free_start = 0;
sftl->next_free_end = 0;
}
}
else if (sftl->free_end_seg - sftl->free_start_seg <= seg_clust-1)
{
// Search for a sequence of at least @seg_clust free segments
u32 i, j, cur_first = 0, cur_free = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (sftl->clust_map[i*seg_clust+j])
{
break;
}
}
if (j == seg_clust)
{
if (cur_free)
{
cur_free++;
}
else
{
cur_first = i;
cur_free = 1;
}
}
else if (cur_free >= seg_clust)
{
break;
}
else
{
cur_free = 0;
}
}
if (cur_free)
{
// If found, remember as next and continue writing into current sequence
sftl->next_free_start = cur_first;
sftl->next_free_end = cur_first+cur_free;
}
else
{
// Search for a freeable sequence
u32 random_free[seg_clust], random_found = 0;
u32 min_freeable_start = 0, min_freeable_cost = seg_clust*seg_clust, cur_freeable_cost = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (i >= seg_clust && sftl->clust_map[i*seg_clust+j - seg_clust*seg_clust])
{
cur_freeable_cost--;
}
if (sftl->clust_map[i*seg_clust+j])
{
cur_freeable_cost++;
}
else if (random_found < seg_clust)
{
random_free[random_found++] = i*seg_clust+j;
}
}
if (i >= seg_clust-1 && cur_freeable_cost < min_freeable_cost)
{
min_freeable_cost = cur_freeable_cost;
min_freeable_start = i-seg_clust+1;
}
}
if (min_freeable_cost < seg_clust*(seg_clust-1))
{
// Best freeable sequence found -> free it and continue writing
sftl->next_free_start = min_freeable_start;
sftl->next_free_end = min_freeable_start+seg_clust;
for (k = min_freeable_start*seg_clust, i = 0; i < seg_clust; i++)
{
for (j = 0; j < seg_clust; j++, k++)
{
if (sftl->clust_map[k])
{
READ(sftl, sftl->clust_map[k]-1, buf);
WRITE(sftl, sftl->clust_map[k]-1, buf);
}
}
}
}
else
{
// Move data into random free clusters
if (sftl->free_end_seg < sftl->segs)
{
next_seg = sftl->free_end_seg;
}
else
{
next_seg = sftl->free_start_seg-1;
}
for (j = 0, i = 0; i < seg_clust && j < random_found; i++)
{
if (sftl->clust_map[next_seg*seg_clust + i])
{
u32 mv = sftl->clust_map[next_seg*seg_clust + i]-1;
READ(sftl, mv, buf);
WRITE_SINGLE(sftl, mv, random_free[j++], buf);
}
}
if (i >= seg_clust)
{
// Adjacent segment freed!
sftl->freesegs++;
if (sftl->free_end_seg < sftl->segs)
{
sftl->free_end_seg++;
}
else
{
sftl->free_start_seg--;
}
}
}
}
}
*/
}
else
bio_endio(bio, 0);
}
}
/*
* The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
* calls this. We need to implement getgeo, since we can't
* use tools such as fdisk to partition the drive otherwise.
*/
int sftl_getgeo(struct block_device * block_device, struct hd_geometry * geo)
{
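/* 4 heads * 16 sectors = 64 sectors per cylinder, hence the shift by 6 below */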
geo->cylinders = (get_capacity(block_device->bd_disk) & ~0x3f) >> 6;
geo->heads = 4;
geo->sectors = 16;
geo->start = 0;
return 0;
}
/*
* The device operations structure.
*/
static struct block_device_operations sftl_ops = {
.owner = THIS_MODULE,
.getgeo = sftl_getgeo
};
static int sftl_reg_major(void)
2013-05-09 03:25:03 +04:00
{
if (!major_num)
2013-05-09 03:25:03 +04:00
{
/* Register major number */
major_num = register_blkdev(major_num, "sftl");
if (major_num < 0)
{
printk(KERN_WARNING "sftl: unable to get major number\n");
return -1;
}
}
return 0;
}
static int __init sftl_init(void)
{
return sftl_reg_major();
}
static void sftl_free_device(struct sftl_dev *dev)
{
if (!dev)
return;
if (dev->buf_size)
{
INFO("Flushing %d pending clusters", dev->buf_size);
sync_io(dev->blkdev, dev->free_start_seg*(seg_clust*clust_blocks+1), dev->buf, seg_clust*clust_sz+phy_sz, WRITE);
dev->buf_size = 0;
// Don't care about adjusting free_start_seg because we're freeing the device
}
if (dev->gd)
{
del_gendisk(dev->gd);
put_disk(dev->gd);
blk_cleanup_queue(dev->queue);
}
if (dev->blkdev)
{
invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1);
blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
if (dev->map)
vfree(dev->map);
if (dev->clust_map)
vfree(dev->clust_map);
if (dev->ver)
vfree(dev->ver);
if (dev->buf)
kfree(dev->buf);
kfree(dev);
}
static void __exit sftl_exit(void)
{
struct list_head *pos, *next;
/* Remove all devices */
list_for_each_safe(pos, next, &sftl_device_list)
{
struct sftl_dev *dev = list_entry(pos, typeof(*dev), list);
struct block_device *bdev = dev->blkdev;
INFO("%s: removing", dev->gd->disk_name);
list_del(&dev->list);
sftl_free_device(dev);
sync_blockdev(bdev);
}
unregister_blkdev(major_num, "sftl");
}
module_init(sftl_init);
module_exit(sftl_exit);
static void bio_map_kern_endio(struct bio *bio, int err)
{
bio_put(bio);
}
// Copy-pasted from fs/bio.c
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
int offset, i;
struct bio *bio;
bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
offset) < bytes)
break;
data += bytes;
len -= bytes;
offset = 0;
}
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
struct bio_seq
{
atomic_t count;
void *private;
bio_end_io_t *endio;
int err;
};
static void bio_map_kern_seq_endio(struct bio *bio, int err)
{
struct bio_seq *seq = bio->bi_private;
if (err)
{
INFO("I/O error %d", err);
seq->err = err;
}
if (atomic_dec_and_test(&seq->count))
{
bio->bi_private = seq->private;
seq->endio(bio, seq->err);
kfree(seq);
}
else
bio_put(bio);
}
/**
* Generates and submits a sequence of 1 or more BIO's. @endio
* will be called after ALL of them will finish.
*
* @bdev: block device
* @data: data buffer
* @len: total length, can exceed @bdev queue limit
* @gfp_mask: mask to use when allocating memory
* @sector: starting sector to write at
* @private: @endio will see this value at bio->bi_private when called
* @endio: normal bio endio callback
* @rw: READ or WRITE
*/
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio))
{
return PTR_ERR(bio);
}
bio->bi_sector = sector;
bio->bi_bdev = bdev;
if (bio->bi_size < len)
{
struct bio_seq *seq = kmalloc(sizeof(struct bio_seq), gfp_mask);
int n = 1;
bio->bi_private = NULL;
bio->bi_end_io = bio_map_kern_seq_endio;
seq->err = 0;
seq->endio = endio;
seq->private = private;
data += bio->bi_size;
len -= bio->bi_size;
sector += bio->bi_size >> 9;
while (len > 0)
{
struct bio *bio2 = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio2))
{
// Free previously allocated bio's
kfree(seq);
while (bio)
{
struct bio *t = bio->bi_private;
bio_put(bio);
bio = t;
}
return PTR_ERR(bio2);
}
bio2->bi_bdev = bdev;
bio2->bi_sector = sector;
bio2->bi_private = bio;
bio2->bi_end_io = bio_map_kern_seq_endio;
data += bio2->bi_size;
len -= bio2->bi_size;
sector += bio2->bi_size >> 9;
bio = bio2;
n++;
}
atomic_set(&seq->count, n);
while (bio)
{
struct bio *t = bio->bi_private;
bio->bi_private = seq;
submit_bio(rw, bio);
bio = t;
}
}
else
{
bio->bi_private = private;
bio->bi_end_io = endio;
submit_bio(rw, bio);
}
return 0;
}
static void endFunc_tryKM2(struct bio *bb, int err)
{
if (bb->bi_private)
{
complete((struct completion*)(bb->bi_private));
}
bio_put(bb);
}
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw)
{
DECLARE_COMPLETION_ONSTACK(waithandle);
int err = bio_submit_kern_seq(bdev, buf, len, GFP_KERNEL, sector, &waithandle, endFunc_tryKM2, rw);
if (err)
{
INFO("I/O error %d", err);
return;
}
wait_for_completion(&waithandle);
}
static void read_maps(struct sftl_dev *dev)
{
struct sftl_map *buf = kmalloc(phy_sz, GFP_KERNEL);
int i, j, seg;
u32 max_first = 0, max_free = 0;
u32 cur_first = 0, cur_free = 0;
u32 dev_clusters = dev->size/clust_blocks;
INFO("reading translation maps");
for (i = 0; i < dev->segs; i++)
{
sync_io(dev->blkdev, (i+1)*(seg_clust*clust_blocks+1) - 1, buf, phy_sz, READ);
for (seg = 1, j = 0; j < seg_clust; j++)
{
if (buf[j].magic[0] == magic[0] &&
buf[j].magic[1] == magic[1] &&
buf[j].magic[2] == magic[2] &&
buf[j].checksum == sftl_map_checksum(buf[j]) &&
buf[j].block < dev_clusters &&
dev->ver[buf[j].block] < buf[j].ver)
{
dev->map[buf[j].block] = i*seg_clust+j;
dev->ver[buf[j].block] = buf[j].ver;
dev->clust_map[i*seg_clust+j] = 1 + buf[j].block;
seg = 0;
}
else
{
dev->freeclust++;
}
}
if (seg)
{
// Segment is free
dev->freesegs++;
if (!cur_free)
{
cur_first = i;
cur_free = 1;
}
else
{
cur_free++;
}
}
else if (cur_free)
{
// Free segment sequence ended
// Remember max free sequence - writes will go there
if (cur_free > max_free)
{
max_first = cur_first;
max_free = cur_free;
}
cur_free = 0;
}
}
if (dev->freesegs < dev->reserved_segs)
{
// It's impossible to really occupy all clusters
BUG_ON(dev->freeclust < dev->reserved_segs * seg_clust);
INFO("All free space is fragmented on the device");
// FIXME: Need to defragment free space on the device...
}
// We'll start writing into a free segment
dev->free_start_seg = (cur_free > max_free ? cur_first : max_first);
dev->free_end_seg = (cur_free > max_free ? cur_first+cur_free : max_first+max_free);
INFO("Current free segment sequence: (%d, %d)", dev->free_start_seg, dev->free_end_seg);
kfree(buf);
}
static struct sftl_dev *add_device(char *devname)
{
const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
struct block_device *bdev;
struct sftl_dev *dev;
int error, index;
uint64_t t;
uint64_t allocated_memory = 0;
if (!major_num)
sftl_reg_major();
if (!devname)
return NULL;
dev = kzalloc(sizeof(struct sftl_dev), GFP_KERNEL);
if (!dev)
return NULL;
/* Get a handle on the underlying device */
bdev = blkdev_get_by_path(devname, mode, dev);
if (IS_ERR(bdev))
{
ERROR("cannot open device %s", devname);
goto devinit_err;
}
dev->blkdev = bdev;
/* if (MAJOR(bdev->bd_dev) == major_num)
{
ERROR("attempting to translate an SFTL device");
goto devinit_err;
}*/
rwlock_init(&dev->buffer_lock);
init_waitqueue_head(&dev->flush_event);
t = get_capacity(dev->blkdev->bd_disk);
do_div(t, seg_clust*clust_blocks + 1);
dev->segs = t;
dev->reserved_segs = seg_clust;
if (dev->segs <= dev->reserved_segs)
{
ERROR("device %s is too small (%lu blocks); size should be no less than %lu blocks",
devname, (unsigned long)get_capacity(dev->blkdev->bd_disk), (unsigned long)((dev->reserved_segs+1) * (seg_clust*clust_blocks + 1)));
goto devinit_err;
}
dev->size = (dev->segs-dev->reserved_segs) * seg_clust * clust_blocks;
dev->map = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->ver = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
memset(dev->ver, 0, sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->clust_map = vmalloc(sizeof(u32) * dev->segs * seg_clust);
memset(dev->clust_map, 0, sizeof(u32) * dev->segs * seg_clust);
allocated_memory = sizeof(u32) * seg_clust * (dev->segs*3 - dev->reserved_segs*2);
dev->buf = kzalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
dev->buf_max = seg_clust;
/* Get a request queue */
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (!dev->queue)
2013-05-09 03:25:03 +04:00
goto devinit_err;
blk_queue_make_request(dev->queue, sftl_make_request);
dev->queue->queuedata = dev;
/* FIXME: It's OK when PAGE_SIZE==clust_sz==4096
but we should ALWAYS support bio of size==PAGE_SIZE */
blk_queue_max_hw_sectors(dev->queue, clust_blocks);
blk_queue_logical_block_size(dev->queue, clust_sz);
/* Allocate index for the new disk */
do {
if (!ida_pre_get(&sftl_index_ida, GFP_KERNEL))
goto devinit_err;
spin_lock(&sftl_index_lock);
error = ida_get_new(&sftl_index_ida, &index);
spin_unlock(&sftl_index_lock);
} while (error == -EAGAIN);
/* Initialise gendisk structure */
dev->gd = alloc_disk(16);
if (!dev->gd)
goto devinit_err;
dev->gd->major = major_num;
dev->gd->first_minor = index*16;
dev->gd->fops = &sftl_ops;
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, 32, "sftl%d", index);
set_capacity(dev->gd, dev->size);
dev->gd->queue = dev->queue;
/* Read maps from the device */
read_maps(dev);
/* Add disk */
add_disk(dev->gd);
list_add(&dev->list, &sftl_device_list);
INFO("%s: translating %s; %d sectors; %d free; used %d kb RAM for mappings", dev->gd->disk_name,
devname, dev->size, (dev->freeclust - dev->reserved_segs*seg_clust)*clust_blocks, (int)(allocated_memory >> 10));
return dev;
devinit_err:
sftl_free_device(dev);
return NULL;
}
static inline void kill_final_newline(char *str)
{
char *newline = strrchr(str, '\n');
if (newline && !newline[1])
*newline = 0;
}
static int sftl_setup(const char *val, struct kernel_param *kp)
{
char buf[80 + 12];
char *str = buf;
char *token[2];
char *name;
int i;
if (strnlen(val, sizeof(buf)) >= sizeof(buf))
{
ERROR("parameter too long");
return 0;
}
strcpy(str, val);
kill_final_newline(str);
for (i = 0; i < 1; i++)
token[i] = strsep(&str, ",");
if (str)
{
ERROR("too many parameters");
return 0;
}
if (!token[0])
{
ERROR("underlying block device name missing");
return 0;
}
name = token[0];
if (strlen(name) + 1 > 80)
{
ERROR("device name too long");
return 0;
}
add_device(name);
return 0;
}
module_param_call(dev, sftl_setup, NULL, NULL, 0200);
MODULE_PARM_DESC(dev, "Block device to translate. \"dev=<dev>\"");
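
/*
* Usage example (illustrative; the device paths below are just examples):
*
*   modprobe sftl dev=/dev/sdb
*
* or, with the module already loaded (the parameter is write-only, perm 0200):
*
*   echo /dev/sdc > /sys/module/sftl/parameters/dev
*
* Each translated disk then appears as /dev/sftl<N> (sftl0 for the first one).
*/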