块设备与字符设备
块设备只能以块为单位接收输入和返回输出,而字符设备则以字节为单位。大多数设备是字符设备,因为它们不需要缓冲而且不以固定块大小进行操作;字符设备只能被顺序读写,而块设备可以随机访问。
块设备对于I/O请求有对应的缓冲区,因此它们可以选择以什么顺序进行响应,字符设备无须缓冲且被直接读写。对于存储设备而言,调整读写的顺序作用巨大,因为在读写连续的扇区的存储速度比分离的扇区更快。
EXT4、UBIFS以及对原始块设备的访问都工作于VFS之下,而它们之下又包含块I/O调度层,以对请求进行排序和合并。
I/O调度层的基本目的是将请求按照它们对应在块设备上的扇区号进行排列,以减少磁头的移动,提高效率。
块设备驱动结构
block_device,gendisk
1. 对于块设备上已经打开的每个分区,都对应一个block_device的实例;
2. 对应于分区的block_device实例通过bd_contains关联到对应于整个块设备的block_device实例;
3. 所有的block_device都通过bd_disk指向其对应的通用磁盘数据结构gendisk。也就是说,对于一个块设备,即使它有多个分区,也只有一个gendisk实例。
4. gendisk实例中part成员指向hd_struct指针的数组,每个数组项都表示一个分区;如果一个block_device表示一个分区,其中就包含一个指向所述hd_struct的指针。hd_struct实例在gendisk和block_device之间是共享的。
block_device在/dev下呈现的就是类似sda1、sdb2等块设备节点
/*
 * block_device - kernel object behind a /dev block node (e.g. sda1).
 * One instance exists for the whole disk and one per opened partition;
 * all of them reach the shared whole-disk object through bd_disk.
 */
struct block_device {
sector_t bd_start_sect; /* first sector of this partition on the disk */
sector_t bd_nr_sectors; /* size of this partition, in sectors */
struct gendisk * bd_disk; /* whole-disk object shared by all partitions */
struct request_queue * bd_queue; /* request queue of the underlying disk */
struct disk_stats __percpu *bd_stats; /* per-CPU I/O statistics */
unsigned long bd_stamp; /* timestamp used by I/O accounting */
atomic_t __bd_flags; // partition number + flags
#define BD_PARTNO 255 // lower 8 bits; assign-once
#define BD_READ_ONLY (1u<<8) // read-only policy
#define BD_WRITE_HOLDER (1u<<9)
#define BD_HAS_SUBMIT_BIO (1u<<10)
#define BD_RO_WARNED (1u<<11)
#ifdef CONFIG_FAIL_MAKE_REQUEST
#define BD_MAKE_IT_FAIL (1u<<12)
#endif
dev_t bd_dev; /* device number (major:minor) */
struct address_space *bd_mapping; /* page cache */
atomic_t bd_openers; /* how many times this bdev is currently open */
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
void * bd_claiming; /* token while an exclusive claim is in flight */
void * bd_holder; /* current exclusive holder, if any */
const struct blk_holder_ops *bd_holder_ops;
struct mutex bd_holder_lock; /* protects bd_holder / bd_holder_ops */
int bd_holders; /* exclusive-open reference count */
struct kobject *bd_holder_dir; /* sysfs directory for holder links */
atomic_t bd_fsfreeze_count; /* number of freeze requests */
struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */
struct partition_meta_info *bd_meta_info; /* partition label/UUID, if known */
int bd_writers; /* writable openers -- NOTE(review): confirm against bdev open code */
/*
 * keep this out-of-line as it's both big and not needed in the fast
 * path
 */
struct device bd_device; /* embedded driver-model device */
} __randomize_layout;
block_device_operations 结构体是对块设备操作的集合;跟字符设备的file_operations结构体类似,在块设备驱动中进行初始化
/*
 * block_device_operations - the block driver's method table, the
 * block-layer analogue of a char device's file_operations.  The driver
 * fills one in and installs it in gendisk->fops.
 */
struct block_device_operations {
int (*open) (struct block_device *, fmode_t); /* device opened */
void (*release) (struct gendisk *, fmode_t); /* device closed */
int (*rw_page)(struct block_device *, sector_t, struct page *, bool); /* single-page read/write */
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); /* device-specific commands */
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); /* 32-bit ioctl on 64-bit kernels */
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing); /* poll for media change / eject events */
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *); /* allow access to the full device size */
int (*revalidate_disk) (struct gendisk *); /* re-read state after a media change */
int (*getgeo)(struct block_device *, struct hd_geometry *); /* report (possibly faked) CHS geometry */
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
struct module *owner; /* module refcount owner, usually THIS_MODULE */
const struct pr_ops *pr_ops; /* persistent-reservation operations */
};
内核用gendisk结构体来表示一个独立的磁盘设备(或分区)
1.同一个磁盘的各个分区共享一个主设备号,而次设备号则不同;
2.fops为block_device_operations,即上节描述的块设备操作集合
3.queue是内核用来管理这个设备的I/O请求队列的指针
/*
 * gendisk - one whole disk.  All partitions of a disk share its major
 * number; each partition is distinguished by its minor number.
 */
struct gendisk {
/* major, first_minor and minors are input parameters only,
 * don't use directly. Use disk_devt() and disk_max_parts().
 */
int major; /* major number of driver */
int first_minor; /* first minor number of this disk */
int minors; /* maximum number of minors, =1 for
 * disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode); /* optional /dev node-name callback */
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
 * Protected with matching bdev lock but stat and other
 * non-critical accesses use RCU. Always access through
 * helpers.
 */
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0; /* partition 0 represents the whole disk */
const struct block_device_operations *fops; /* driver's method table (see above) */
struct request_queue *queue; /* I/O request queue for this disk */
void *private_data; /* driver-private cookie */
int flags; /* GENHD_FL_* flags -- TODO confirm exact set for this kernel version */
struct kobject *slave_dir; /* sysfs "slaves" directory for stacked devices */
struct timer_rand_state *random; /* disk-activity entropy state */
atomic_t sync_io; /* RAID */
struct disk_events *ev; /* media-event polling state */
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id; /* preferred NUMA node for allocations */
struct badblocks *bb; /* known-bad sector ranges, if tracked */
};
相关API
分配gendisk :struct gendisk *alloc_disk(int minors);
增加gendisk :gendisk结构体被分配之后,系统还不能使用这个磁盘,需要调用如下函数来注册这个磁盘设备 :void add_disk(struct gendisk *disk);
释放gendisk :void del_gendisk(struct gendisk *gp);
gendisk引用计数 :struct kobject *get_disk(struct gendisk *disk); void put_disk(struct gendisk *disk);
bio、request和request_queue
通常一个bio对应上层传递给块层的I/O请求。每个bio结构体实例及其包含的bvec_iter、bio_vec结构体实例描述了该I/O请求的开始扇区、数据方向、数据放入的页
/*
 * bio - one block I/O request as seen by the block layer: target disk
 * and partition, operation/flags (bi_opf), current position and
 * residual size (bi_iter), and the data segments (bi_io_vec array).
 */
struct bio {
struct bio *bi_next; /* request queue link */
struct gendisk *bi_disk; /* target disk */
unsigned int bi_opf; /* bottom bits req flags,
 * top bits REQ_OP. Use
 * accessors.
 */
unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio; /* I/O priority */
unsigned short bi_write_hint; /* write lifetime hint -- TODO confirm semantics */
blk_status_t bi_status; /* completion status (BLK_STS_*) */
u8 bi_partno; /* target partition number on bi_disk */
/* Number of segments in this BIO after
 * physical address coalescing is performed.
 */
unsigned int bi_phys_segments;
/*
 * To keep track of the max segment size, we account for the
 * sizes of the first and last mergeable segments in this bio.
 */
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
struct bvec_iter bi_iter; /* current position: start sector, residual bytes, segment index */
atomic_t __bi_remaining; /* completion countdown for chained/split bios */
bio_end_io_t *bi_end_io; /* called when the bio completes */
void *bi_private; /* owner-private pointer */
#ifdef CONFIG_BLK_CGROUP
/*
 * Optional ioc and css associated with this bio. Put on bio
 * release. Read comment on top of bio_associate_current().
 */
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
void *bi_cg_private;
struct blk_issue_stat bi_issue_stat;
#endif
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
};
unsigned short bi_vcnt; /* how many bio_vec's */
/*
 * Everything starting with bi_max_vecs will be preserved by bio_reset()
 */
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t __bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool; /* bio_set this bio was allocated from */
/*
 * We can inline a number of vecs at the end of the bio, to avoid
 * double allocations for a small number of bio_vecs. This member
 * MUST obviously be kept at the very end of the bio.
 */
struct bio_vec bi_inline_vecs[0]; /* inline segment array (flexible) */
};
/*
 * bvec_iter - iteration state over a bio's segment array; records the
 * current device sector and how much of the I/O remains.
 */
struct bvec_iter {
sector_t bi_sector; /* device address in 512 byte
sectors */
unsigned int bi_size; /* residual I/O count */
unsigned int bi_idx; /* current index into bvl_vec */
unsigned int bi_done; /* number of bytes completed */
unsigned int bi_bvec_done; /* number of bytes completed in
current bvec */
};
与bio对应的数据每次存放的内存不一定是连续的,bio_vec结构体用来描述与这个bio请求对应的所有的内存,它可能不总是在一个页面里面,因此需要一个向量
/*
 * bio_vec - one contiguous segment of an I/O buffer, described as a
 * (page, length, offset-in-page) triple.  A bio carries an array of
 * these because its data need not be physically contiguous.
 */
struct bio_vec {
struct page *bv_page; /* page holding this segment's data */
unsigned int bv_len; /* length of the segment in bytes */
unsigned int bv_offset; /* byte offset of the data within bv_page */
};
I/O调度算法可将连续的bio合并成一个请求。请求是bio经由I/O调度进行调整后的结果,这是请求和bio的区别。因此,一个request可以包含多个bio。当bio被提交给I/O调度器时,I/O调度器可能会将这个bio插入现存的请求中,也可能生成新的请求。
每个块设备或者块设备的分区都对应有自身的request_queue,从I/O调度器合并和排序出来的请求会被分发(Dispatch)到设备级的request_queue
相关API
初始化请求队列
request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock);
清除请求队列
void blk_cleanup_queue(request_queue_t * q);
分配请求队列
request_queue_t *blk_alloc_queue(int gfp_mask);
对于RAMDISK这种完全随机访问的非机械设备,并不需要进行复杂的I/O调度,这个时候,可以直接“踢开”I/O调度器,使用如下函数来绑定请求队列和“制造请求”函数(make_request_fn)
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn);
结合起来使用的逻辑一般是
xxx_queue = blk_alloc_queue(GFP_KERNEL);
blk_queue_make_request(xxx_queue, xxx_make_request);
提取请求
struct request * blk_peek_request(struct request_queue *q);
用于返回下一个要处理的请求(由I/O调度器决定),如果没有请求则返回NULL。它不会清除请求,而是仍然将这个请求保留在队列上
启动请求
void blk_start_request(struct request *req);
从请求队列中移除请求
可以考虑使用blk_fetch_request;它同时做完了blk_peek_request()和blk_start_request()的工作
遍历bio和片段
__rq_for_each_bio()遍历一个请求的所有bio
/* Iterate over every bio attached to request @rq, in list order. */
#define __rq_for_each_bio(_bio, rq) \
if ((rq->bio)) \
for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
bio_for_each_segment()遍历一个bio的所有bio_vec
/*
 * Walk each bio_vec of @bio starting from iterator state @start;
 * bio_advance_iter() moves the iterator past each visited segment.
 */
#define __bio_for_each_segment(bvl, bio, iter, start) \
for (iter = (start); \
(iter).bi_size && \
((bvl = bio_iter_iovec((bio), (iter))), 1); \
bio_advance_iter((bio), &(iter), (bvl).bv_len))
rq_for_each_segment()迭代遍历一个请求所有bio中的所有segment
/* Iterate over every segment of every bio in request @_rq. */
#define rq_for_each_segment(bvl, _rq, _iter) \
__rq_for_each_bio(_iter.bio, _rq) \
bio_for_each_segment(bvl, _iter.bio, _iter.iter)
报告完成
void blk_end_request_all(struct request *rq, int error);
若我们用blk_queue_make_request()绕开I/O调度,但是在bio处理完成后应该使用bio_endio()函数通知处理结束
void bio_endio(struct bio *bio, int error);
I/O调度器
Noop I/O调度器是一个简化的调度程序,该算法实现了一个简单FIFO队列,它只进行最基本的合并,比较适合基于Flash的存储器
Anticipatory I/O调度器算法推迟I/O请求,以期能对它们进行排序,获得最高的效率。在每次处理完读请求之后,不是立即返回,而是等待几个毫秒。在这段时间内,任何来自临近区域的请求都被立即执行。超时以后,继续原来的处理
Deadline I/O调度器是针对Anticipatory I/O调度器的缺点进行改善而得来的,它试图把每次请求的延迟降至最低,该算法重排了请求的顺序来提高性能
CFQ I/O调度器为系统内的所有任务分配均匀的I/O带宽,提供一个公平的工作环境,一般作为默认的调度器
块设备驱动注册
块设备驱动要注册它们自己到内核,申请设备号,完成这个任务的函数是int register_blkdev(unsigned int major, const char *name);major参数是块设备要使用的主设备号,name为设备名,它会显示在/proc/devices中。如果major为0,内核会自动分配一个新的主设备号,register_blkdev()函数的返回值就是这个主设备号;除此之外,在块设备驱动初始化过程中,通常需要完成分配、初始化请求队列,绑定请求队列和请求处理函数的工作,并且可能会分配、初始化gendisk,给gendisk的major、fops、queue等成员赋值,最后添加gendisk。
模板
模板如下,比如drivers/mmc/card/block.c简化后就是这样的上述流程
static int xxx_init(void)
{
/* 块设备驱动注册 */
if (register_blkdev(XXX_MAJOR,xxx")) {
err = -EIO;
goto out;
}
/* 请求队列初始化 */
xxx_queue = blk_init_queue(xxx_request, xxx_lock);
if (!xxx_queue)
goto out_queue;
blk_queue_max_hw_sectors(xxx_queue, 255);
blk_queue_logical_block_size(xxx_queue, 512);
/* gendisk初始化 */
xxx_disks->major = XXX_MAJOR;
xxx_disks->first_minor = 0;
xxx_disks->fops = &xxx_op;
xxx_disks->queue = xxx_queue;
sprintf(xxx_disks->disk_name,"xxx%d", i);
set_capacity(xxx_disks, xxx_size *2);
add_disk(xxx_disks); /* 添加gendisk */
return 0;
out_queue: unregister_blkdev(XXX_MAJOR,"xxx");
out: put_disk(xxx_disks);
blk_cleanup_queue(xxx_queue);
return -ENOMEM;
}
实例
vmem_disk驱动的模块加载函数与上述给出的模板完全一致,它支持制造请求、请求队列两种模式
/*
 * Initialize one vmem_disk device: allocate the RAM backing store,
 * set up the request queue in the selected mode, then allocate, fill
 * in and register the gendisk.  On failure the backing store is freed
 * and the device is simply left unregistered (the function is void).
 */
static void setup_device(struct vmem_disk_dev *dev, int which)
{
memset (dev, 0, sizeof (struct vmem_disk_dev));
dev->size = NSECTORS*HARDSECT_SIZE; /* device capacity in bytes */
dev->data = vmalloc(dev->size); /* RAM that backs the "disk" */
if (dev->data == NULL) {
printk (KERN_NOTICE "vmalloc failure.\n");
return;
}
spin_lock_init(&dev->lock);
/*
 * The I/O queue, depending on whether we are using our own
 * make_request function or not.
 */
switch (request_mode) {
case VMEMD_NOQUEUE:
/* Bypass the I/O scheduler: bios go straight to vmem_disk_make_request(). */
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (dev->queue == NULL)
goto out_vfree;
blk_queue_make_request(dev->queue, vmem_disk_make_request);
break;
default:
printk(KERN_NOTICE "Bad request mode %d, using simple\n", request_mode);
/* fall through */
case VMEMD_QUEUE:
/* Full request queue serviced by vmem_disk_request() under dev->lock. */
dev->queue = blk_init_queue(vmem_disk_request, &dev->lock);
if (dev->queue == NULL)
goto out_vfree;
break;
}
blk_queue_logical_block_size(dev->queue, HARDSECT_SIZE);
dev->queue->queuedata = dev; /* lets queue callbacks find the device */
dev->gd = alloc_disk(VMEM_DISK_MINORS); /* reserve minors for this device */
if (!dev->gd) {
printk (KERN_NOTICE "alloc_disk failure\n");
goto out_vfree;
}
dev->gd->major = vmem_disk_major;
dev->gd->first_minor = which*VMEM_DISK_MINORS;
dev->gd->fops = &vmem_disk_ops;
dev->gd->queue = dev->queue;
dev->gd->private_data = dev;
snprintf (dev->gd->disk_name, 32,"vmem_disk%c", which + 'a'); /* vmem_diska, vmem_diskb, ... */
set_capacity(dev->gd, NSECTORS*(HARDSECT_SIZE/KERNEL_SECTOR_SIZE)); /* in 512-byte kernel sectors */
add_disk(dev->gd); /* disk becomes visible here */
return;
out_vfree:
if (dev->data)
vfree(dev->data);
}
/*
 * Module load: register a (dynamically allocated, when the module
 * parameter is 0) major number, allocate the per-device array and set
 * up each device.
 *
 * Fixes vs. the original: the failure path unregistered the name
 * "sbd" (a copy-paste leftover) instead of the "vmem_disk" actually
 * registered, and the device array is now allocated zeroed and
 * overflow-checked with kcalloc().
 */
static int __init vmem_disk_init(void)
{
	int i;

	/* A major of 0 asks the kernel to pick one; the return is the major. */
	vmem_disk_major = register_blkdev(vmem_disk_major, "vmem_disk");
	if (vmem_disk_major <= 0) {
		printk(KERN_WARNING "vmem_disk: unable to get major number\n");
		return -EBUSY;
	}

	devices = kcalloc(NDEVICES, sizeof(struct vmem_disk_dev), GFP_KERNEL);
	if (!devices)
		goto out_unregister;

	for (i = 0; i < NDEVICES; i++)
		setup_device(devices + i, i);

	return 0;

out_unregister:
	/* Must match the name passed to register_blkdev() above. */
	unregister_blkdev(vmem_disk_major, "vmem_disk");
	return -ENOMEM;
}
module_init(vmem_disk_init);
module_init(vmem_disk_init);
io请求处理
1. vmem_disk_transfer()完成真实的硬件I/O操作;
2. vmem_disk_xfer_bio()函数调用它来完成一个与bio对应的硬件操作(bio_for_each_segment()展开了该bio中的每个segment);
3. vmem_disk_make_request()直接调用vmem_disk_xfer_bio()来完成一个bio操作;
4. vmem_disk_request()则通过blk_peek_request()先从request_queue拿出一个请求,再通过__rq_for_each_bio()从该请求中取出一个bio,之后调用vmem_disk_xfer_bio()来完成该I/O请求。
/*
 * Handle an I/O request: copy @nsect sectors starting at @sector
 * between @buffer and the vmalloc'ed backing store.
 * @write: non-zero copies into the device, zero copies out of it.
 * A transfer that would run past the end of the device is logged and
 * dropped (fix: the message now reports the real direction instead of
 * always saying "write").
 */
static void vmem_disk_transfer(struct vmem_disk_dev *dev, unsigned long sector,
	unsigned long nsect, char *buffer, int write)
{
	unsigned long offset = sector*KERNEL_SECTOR_SIZE;
	unsigned long nbytes = nsect*KERNEL_SECTOR_SIZE;

	if ((offset + nbytes) > dev->size) {
		printk (KERN_NOTICE "Beyond-end %s (%ld %ld)\n",
			write ? "write" : "read", offset, nbytes);
		return;
	}
	if (write)
		memcpy(dev->data + offset, buffer, nbytes);
	else
		memcpy(buffer, dev->data + offset, nbytes);
}
/*
 * Transfer a single BIO.
 *
 * Walks every segment of @bio with bio_for_each_segment(), maps each
 * segment's page and copies it to/from the RAM disk via
 * vmem_disk_transfer().  Direction comes from bio_data_dir().
 * Always returns 0.
 */
static int vmem_disk_xfer_bio(struct vmem_disk_dev *dev, struct bio *bio)
{
struct bio_vec bvec;
struct bvec_iter iter;
sector_t sector = bio->bi_iter.bi_sector; /* starting sector of the bio */
bio_for_each_segment(bvec, bio, iter) {
char *buffer = __bio_kmap_atomic(bio, iter); /* map the segment's page */
vmem_disk_transfer(dev, sector, bio_cur_bytes(bio) >> 9,
buffer, bio_data_dir(bio) == WRITE);
sector += bio_cur_bytes(bio) >> 9; /* advance by segment size in 512-byte sectors */
__bio_kunmap_atomic(buffer);
}
return 0;
}
/*
 * The request_queue version.
 *
 * Old-style request_fn installed via blk_init_queue(); the block core
 * calls it with the queue lock held.  It peeks requests off the queue,
 * fails non-filesystem requests with -EIO, and services the rest bio
 * by bio.
 */
static void vmem_disk_request(struct request_queue *q)
{
struct request *req;
struct bio *bio;
while ((req = blk_peek_request(q)) != NULL) { /* next request; still on the queue */
struct vmem_disk_dev *dev = req->rq_disk->private_data;
if (req->cmd_type != REQ_TYPE_FS) {
printk (KERN_NOTICE "Skip non-fs request\n");
blk_start_request(req); /* dequeue it */
__blk_end_request_all(req, -EIO); /* and fail it */
continue;
}
blk_start_request(req); /* dequeue before processing */
__rq_for_each_bio(bio, req)
vmem_disk_xfer_bio(dev, bio); /* do the copy for each bio */
__blk_end_request_all(req, 0); /* report completion */
}
}
/*
 * The direct make request version.
 *
 * Installed via blk_queue_make_request() in VMEMD_NOQUEUE mode: the
 * bio bypasses the I/O scheduler, is transferred immediately and then
 * completed with bio_endio().
 */
static void vmem_disk_make_request(struct request_queue *q, struct bio *bio)
{
struct vmem_disk_dev *dev = q->queuedata; /* set in setup_device() */
int status;
status = vmem_disk_xfer_bio(dev, bio); /* always 0 on this RAM disk */
bio_endio(bio, status);
}