本文的源码版本是Linux 5.15版本,有图有真相:
1.先从块设备驱动说起
安卓平台有一个非常典型和重要的块设备驱动:zram,我们来看一下zram这个块设备驱动加载初始化和swapon的逻辑,完整梳理完这个逻辑将对Linux块设备驱动模型有深入的理解。
zram驱动加载的时候会调用zram_add函数,源码如下:
1887/*
1888 * Allocate and initialize new zram device. the function returns
1889 * '>= 0' device_id upon success, and negative value otherwise.
1890 */
1891static int zram_add(void)
1892{
1893 struct zram *zram;
1894 int ret, device_id;
1895
1896 zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1909 ...
1910 /* gendisk structure */
1911 zram->disk = blk_alloc_disk(NUMA_NO_NODE);
1912 ...
1918
1919 zram->disk->major = zram_major;
1920 zram->disk->first_minor = device_id;
1921 zram->disk->minors = 1;
1922 zram->disk->fops = &zram_devops;
1923 zram->disk->private_data = zram;
1924 snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1925 ...
1957 device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
1961 zram_debugfs_register(zram);
1962 pr_info("Added device: %s\n", zram->disk->disk_name);
1963 return device_id;
...
1970}
zram_add中有两个非常重要的函数:
- blk_alloc_disk
- device_add_disk
上面两个函数描述了块设备驱动的两个步骤:1)创建gendisk对象,代表的是一个“磁盘” 2)注册和激活磁盘,激活磁盘之后就可以正式使用了。
2. gendisk和hd_struct是啥
blk_alloc_disk函数创建了一个gendisk对象,这就出现了本文要讲述的非常重要的对象。怎么理解gendisk呢?Linux用gendisk代表一个“磁盘”,这里的磁盘可以是一个真实的硬盘,也可以是一个虚拟设备。
我们接触Windows系统比较多,硬盘都会划分分区,在Linux里是不是也有同样的概念呢?确实如此,这就要到struct gendisk数据结构里一探究竟了:
121struct gendisk {
122 /* major, first_minor and minors are input parameters only,
123 * don't use directly. Use disk_devt() and disk_max_parts().
124 */
125 int major; /* major number of driver */
126 int first_minor;
127 int minors; /* maximum number of minors, =1 for
128 * disks that can't be partitioned. */
129
130 char disk_name[DISK_NAME_LEN]; /* name of major driver */
131
132 unsigned short events; /* supported events */
133 unsigned short event_flags; /* flags related to event processing */
134
135 struct xarray part_tbl;
136 struct block_device *part0;
137
138 const struct block_device_operations *fops;
139 struct request_queue *queue;
...
}
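gendisk中的part_tbl就代表该磁盘的分区表,那么每个分区用什么结构体表示?在较早的内核中是struct hd_struct;而在本文使用的5.15内核中,hd_struct已经被合并进struct block_device,part_tbl这个xarray里保存的就是每个分区对应的block_device指针(后文可以看到,__alloc_disk_node会通过xa_insert把part0插入part_tbl的0号位置)。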
注意gendisk结构体中还有一个重要的fops字段,它指向操作该块设备的操作函数列表(struct block_device_operations),具体本文后面详细讲述。
3. gendisk是怎么创建的
前面知道gendisk是通过blk_alloc_disk函数创建的:
275#define blk_alloc_disk(node_id) \
276({ \
277 static struct lock_class_key __key; \
278 \
279 __blk_alloc_disk(node_id, &__key); \
280})
1333struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
1334{
1335 struct request_queue *q;
1336 struct gendisk *disk;
1337
1338 q = blk_alloc_queue(node);
1339 if (!q)
1340 return NULL;
1341
1342 disk = __alloc_disk_node(q, node, lkclass);
1343 if (!disk) {
1344 blk_cleanup_queue(q);
1345 return NULL;
1346 }
1347 return disk;
1348}
1279
1280struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
1281 struct lock_class_key *lkclass)
1282{
1283 struct gendisk *disk;
1284
1285 if (!blk_get_queue(q))
1286 return NULL;
1287
1288 disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1289 if (!disk)
1290 goto out_put_queue;
1291
1292 disk->bdi = bdi_alloc(node_id);
1293 if (!disk->bdi)
1294 goto out_free_disk;
1295
1296 disk->part0 = bdev_alloc(disk, 0);
1297 if (!disk->part0)
1298 goto out_free_bdi;
1299
1300 disk->node_id = node_id;
1301 mutex_init(&disk->open_mutex);
1302 xa_init(&disk->part_tbl);
1303 if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
1304 goto out_destroy_part_tbl;
1305
总结一下调用关系:blk_alloc_disk->__blk_alloc_disk->__alloc_disk_node,最终__alloc_disk_node创建了gendisk对象。
创建了gendisk之后,要给gendisk做一些初始化赋值,其中很重要的part0是block_device,通过调用bdev_alloc(disk,0)创建,这里出现了本文最后一个要介绍的对象:struct block_device,
4 block_device是啥
block_device具体可以对应一个磁盘,也可以对应磁盘里面的一个分区,也就是说磁盘和分区都可用block_device表示,block_device可以想象成磁盘(或分区)的描述信息,比如设备号,分区号,是否只读等等,具体定义如下:
24struct block_device {
25 sector_t bd_start_sect;
26 struct disk_stats __percpu *bd_stats;
27 unsigned long bd_stamp;
28 bool bd_read_only; /* read-only policy */
29 dev_t bd_dev;
30 int bd_openers;
31 struct inode * bd_inode; /* will die */
32 struct super_block * bd_super;
33 void * bd_claiming;
34 struct device bd_device;
35 void * bd_holder;
36 int bd_holders;
37 bool bd_write_holder;
38 struct kobject *bd_holder_dir;
39 u8 bd_partno;
40 spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
41 struct gendisk * bd_disk;
42
43 /* The counter of freeze processes */
44 int bd_fsfreeze_count;
45 /* Mutex for freeze */
46 struct mutex bd_fsfreeze_mutex;
47 struct super_block *bd_fsfreeze_sb;
48
49 struct partition_meta_info *bd_meta_info;
50#ifdef CONFIG_FAIL_MAKE_REQUEST
51 bool bd_make_it_fail;
52#endif
53
54 ANDROID_KABI_RESERVE(1);
55 ANDROID_KABI_RESERVE(2);
56 ANDROID_KABI_RESERVE(3);
57 ANDROID_KABI_RESERVE(4);
58} __randomize_layout;
5.block_device怎么创建的
bdev_alloc创建了disk->part0这个block_device对象,我们来看下非常重要的bdev_alloc函数:
478struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
479{
480 struct block_device *bdev;
481 struct inode *inode;
482
483 inode = new_inode(blockdev_superblock);
484 if (!inode)
485 return NULL;
//块设备文件对应inode设置为块设备
486 inode->i_mode = S_IFBLK;
487 inode->i_rdev = 0;
488 inode->i_data.a_ops = &def_blk_aops;
489 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
490
//new_inode创建的本质上是bdev_inode,I_BDEV获取bdev_inode结构体的字段bdev
491 bdev = I_BDEV(inode);
492 mutex_init(&bdev->bd_fsfreeze_mutex);
493 spin_lock_init(&bdev->bd_size_lock);
//初始化block_device,设置分区号,inode,gendisk对象
494 bdev->bd_partno = partno;
495 bdev->bd_inode = inode;
496 bdev->bd_stats = alloc_percpu(struct disk_stats);
497 if (!bdev->bd_stats) {
498 iput(inode);
499 return NULL;
500 }
501 bdev->bd_disk = disk;
502 return bdev;
503}
上面new_inode函数调用可以参考:zram压缩机制看swapon系统调用_swapon设置为zram-CSDN博客
总结来讲new_inode返回的本质上是一个bdev_inode对象,其定义如下:
32struct bdev_inode {
33 struct block_device bdev;
34 struct inode vfs_inode;
35};
36
也就是说new_inode创建bdev_inode的同时,本质上也创建了一个block_device对象。这里bdev_inode中的vfs_inode是块设备在blockdev_superblock(内核内部的bdev伪文件系统)中的inode;以zram驱动为例,它代表的就是/dev/block/zram0这个块设备,后面会看到通过设备号可以找到它。
6. 激活磁盘
激活磁盘使用的是device_add_disk函数:
/**
* device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @groups: Additional per-device sysfs groups
*
* This function registers the partitioning information in @disk
* with the kernel.
*/
int device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
int ret;
/*
* The disk queue should now be all set with enough information about
* the device for the elevator code to pick an adequate default
* elevator if one is needed, that is, for devices requesting queue
* registration.
*/
elevator_init_mq(disk->queue);
/*
* If the driver provides an explicit major number it also must provide
* the number of minors numbers supported, and those will be used to
* setup the gendisk.
* Otherwise just allocate the device numbers for both the whole device
* and all partitions from the extended dev_t space.
*/
if (disk->major) {
if (WARN_ON(!disk->minors))
return -EINVAL;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
DISK_MAX_PARTS);
disk->minors = DISK_MAX_PARTS;
}
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
return -EINVAL;
} else {
if (WARN_ON(disk->minors))
return -EINVAL;
ret = blk_alloc_ext_minor();
if (ret < 0)
return ret;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
disk->flags |= GENHD_FL_EXT_DEVT;
}
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
ddev->parent = parent;
ddev->groups = groups;
dev_set_name(ddev, "%s", disk->disk_name);
if (!(disk->flags & GENHD_FL_HIDDEN))
ddev->devt = MKDEV(disk->major, disk->first_minor);
//非常重要的函数
ret = device_add(ddev);
if (ret)
goto out_free_ext_minor;
ret = disk_alloc_events(disk);
if (ret)
goto out_device_del;
if (!sysfs_deprecated) {
ret = sysfs_create_link(block_depr, &ddev->kobj,
kobject_name(&ddev->kobj));
if (ret)
goto out_device_del;
}
/*
* avoid probable deadlock caused by allocating memory with
* GFP_KERNEL in runtime_resume callback of its all ancestor
* devices
*/
pm_runtime_set_memalloc_noio(ddev, true);
ret = blk_integrity_add(disk);
if (ret)
goto out_del_block_link;
disk->part0->bd_holder_dir =
kobject_create_and_add("holders", &ddev->kobj);
if (!disk->part0->bd_holder_dir) {
ret = -ENOMEM;
goto out_del_integrity;
}
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
if (!disk->slave_dir) {
ret = -ENOMEM;
goto out_put_holder_dir;
}
ret = bd_register_pending_holders(disk);
if (ret < 0)
goto out_put_slave_dir;
ret = blk_register_queue(disk);
if (ret)
goto out_put_slave_dir;
if (disk->flags & GENHD_FL_HIDDEN) {
/*
* Don't let hidden disks show up in /proc/partitions,
* and don't bother scanning for partitions either.
*/
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART;
} else {
ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor);
if (ret)
goto out_unregister_queue;
bdi_set_owner(disk->bdi, ddev);
ret = sysfs_create_link(&ddev->kobj,
&disk->bdi->dev->kobj, "bdi");
if (ret)
goto out_unregister_bdi;
//非常重要
bdev_add(disk->part0, ddev->devt);
disk_scan_partitions(disk);
/*
* Announce the disk and partitions after all partitions are
* created. (for hidden disks uevents remain suppressed forever)
*/
dev_set_uevent_suppress(ddev, 0);
disk_uevent(disk, KOBJ_ADD);
}
disk_update_readahead(disk);
disk_add_events(disk);
return 0;
...
}
总结起来device_add_disk调用了两个非常重要的函数:
- device_add
- bdev_add
device_add函数
device_add
--->devtmpfs_create_node
devtmpfs会给devtmpfs文件系统的线程发送创建块文件的消息,类似mknod,然后在/dev/目录下创建出来块文件。
bdev_add函数:
void bdev_add(struct block_device *bdev, dev_t dev)
{
bdev->bd_dev = dev;
bdev->bd_inode->i_rdev = dev;
bdev->bd_inode->i_ino = dev;
insert_inode_hash(bdev->bd_inode);
}
bdev_add设置block_device的bd_dev为块设备号,同时把block_device->bd_inode的i_rdev和i_ino也设置为块设备号,然后通过insert_inode_hash函数将block_device的bd_inode添加到superblock的inode hash表中。这段逻辑对理解swapon非常重要,我们知道swapon系统调用有如下一段代码:
swapon系统调用:
swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swap_file)) {
error = PTR_ERR(swap_file);
swap_file = NULL;
goto bad_swap;
}
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
dentry = swap_file->f_path.dentry;
inode = mapping->host;
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
int error;
if (S_ISBLK(inode->i_mode)) {
p->bdev = blkdev_get_by_dev(inode->i_rdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
return error;
/*
* Zoned block devices contain zones that have a sequential
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
}
return 0;
}
blkdev_get_by_dev函数返回了一个block_device,这个block_device跟前面zram块驱动blk_alloc_disk 生成的block_device有啥关系?就是同一个,我们还是要从源码视角看懂这一切:
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
bool unblock_events = true;
struct block_device *bdev;
struct gendisk *disk;
int ret;
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
return ERR_PTR(ret);
bdev = blkdev_get_no_open(dev);
if (!bdev)
return ERR_PTR(-ENXIO);
disk = bdev->bd_disk;
...
return bdev;
}
struct block_device *blkdev_get_no_open(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
if (!inode) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (!inode)
return NULL;
}
/* switch from the inode reference to a device mode one: */
bdev = &BDEV_I(inode)->bdev;
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL;
iput(inode);
if (!bdev)
return NULL;
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
!try_module_get(bdev->bd_disk->fops->owner)) {
put_device(&bdev->bd_device);
return NULL;
}
return bdev;
}
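inode = ilookup(blockdev_superblock, dev);根据块设备号dev,从blockdev_superblock拿到inode节点。ilookup是按inode号(i_ino)在inode hash表中查找的,而bdev_add的时候已经把bd_inode的i_ino设置为块设备号dev,并通过insert_inode_hash将其加入blockdev_superblock的inode hash表中,所以这里用dev就能拿到zram0这个块设备对应的inode,再通过BDEV_I(inode)->bdev得到的就是驱动初始化时bdev_alloc创建的那个block_device(即disk->part0)。注意这个inode属于内核内部的bdev伪文件系统,与/dev/block/zram0设备节点文件自身的inode不是同一个,两者通过设备号(i_rdev)关联。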
参考文章:
块设备剖析之关键数据结构分析 - block_device/gendisk/hd_struct-下雨夜-ChinaUnix博客