转自:Linux 虚拟文件系统 VFS 源码剖析 -- 以 ext4 为例(Part1) - 知乎
虚拟文件系统(Virtual File System, VFS)是 Linux 内核的一个组件,用于处理与文件和文件系统相关的所有系统调用。VFS 是内核向用户态应用程序提供文件系统接口的通用接口层,同时也提供了抽象化的操作接口,以便底层各种文件系统实现。
1、VFS 数据结构
VFS 涉及到的数据结构比较复杂,比较重要的是下面 5 个:
- file_system_type
- super_block
- dentry
- inode
- file
file_system_type 表示一种文件系统,比如 ext2、ext4 等。
而 super_block 代表具体某个已经挂载的文件系统,标识一个文件系统实例。每个物理的磁盘、硬盘都有一个文件控制块 FCB,super_block 相当于 FCB 的内存映像。
dentry 表示目录项(directory entry),用于把路径中的一个名字与对应的 inode 关联起来。VFS 将目录也当作文件,目录也有一个 inode,不过目录的操作函数和普通文件的操作函数不同。
inode 包含了内核在操作文件或目录时需要的全部信息。对于 UNIX 风格的文件系统,这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件,内核中一个 inode 代表打开的一个文件。
file 是从进程角度表示一个打开的文件。
2、file_system_type
Linux 使用 file_system_type 表示一种文件系统,比如 ext2、ext4、exfat 等等。文件系统可以有多个实例,每个实例都使用 super_block 表征。
/// include/linux/fs.h
2230 struct file_system_type {
2231 const char *name;
2232 int fs_flags;
2233 #define FS_REQUIRES_DEV 1
2234 #define FS_BINARY_MOUNTDATA 2
2235 #define FS_HAS_SUBTYPE 4
2236 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
2237 #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
2238 #define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
2239 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
2240 int (*init_fs_context)(struct fs_context *);
2241 const struct fs_parameter_spec *parameters;
2242 struct dentry *(*mount) (struct file_system_type *, int,
2243 const char *, void *);
2244 void (*kill_sb) (struct super_block *);
2245 struct module *owner;
2246 struct file_system_type * next;
2247 struct hlist_head fs_supers;
2248
2249 struct lock_class_key s_lock_key;
2250 struct lock_class_key s_umount_key;
2251 struct lock_class_key s_vfs_rename_key;
2252 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
2253
2254 struct lock_class_key i_lock_key;
2255 struct lock_class_key i_mutex_key;
2256 struct lock_class_key i_mutex_dir_key;
2257 };
- name 表示文件系统名字,比如 ext2、ext4;
- fs_flags 是一些 FS_ 标志位;
- init_fs_context/parameters 和 fs_context 有关系;
- mount 函数指针指向挂载文件系统实例的函数;
- kill_sb 函数指针指向卸载文件系统实例时销毁其 super_block 的函数;
- next 指针用于将文件系统链接成链表;
- fs_supers 是链表头,用于链接文件系统的所有实例;
2.1、register_filesystem()/unregister_filesystem()
在挂载某个文件系统实例前,Linux 必须支持该文件系统。换句话说,需要先将该文件系统注册到 Linux。向 Linux 注册文件系统是通过 register_filesystem() 函数完成的。
Linux 使用链表管理注册的文件系统,所有的文件系统都被链接到一个链表上。Linux 中的文件系统链表没有头节点,全局变量 file_systems 指向该链表的第一个元素。
/// fs/filesystem.c
34 static struct file_system_type *file_systems;
35 static DEFINE_RWLOCK(file_systems_lock);
Linux 不允许同一个文件系统重复注册。find_filesystem() 函数可以查找是否已经注册了某个文件系统。find_filesystem() 函数返回的是指针的指针 p:如果该文件系统已注册,*p 指向该文件系统;否则 p 指向链表末尾的 next 指针(链表为空时指向 file_systems),此时 *p 为 NULL。find_filesystem() 函数的处理手法值得我们学习:如何将查找和插入结合,在查找失败时,可以利用查找结果直接插入元素。
/// fs/filesystem.c
49 static struct file_system_type **find_filesystem(const char *name, unsigned len)
50 {
51 struct file_system_type **p;
52 for (p = &file_systems; *p; p = &(*p)->next)
53 if (strncmp((*p)->name, name, len) == 0 &&
54 !(*p)->name[len])
55 break;
56 return p;
57 }
register_filesystem() 函数将一个文件系统注册到全局链表 file_systems 上,如果已经存在,则返回 -EBUSY,表示注册失败。其主要逻辑是调用 find_filesystem() 函数查找链表中是否已经存在待注册文件系统,不存在时才将待注册文件系统添加到链表末尾。
/// fs/filesystem.c
72 int register_filesystem(struct file_system_type * fs)
73 {
74 int res = 0;
75 struct file_system_type ** p;
76
77 if (fs->parameters &&
78 !fs_validate_description(fs->name, fs->parameters))
79 return -EINVAL;
80
81 BUG_ON(strchr(fs->name, '.'));
82 if (fs->next)
83 return -EBUSY;
84 write_lock(&file_systems_lock);
85 p = find_filesystem(fs->name, strlen(fs->name));
86 if (*p)
87 res = -EBUSY;
88 else
89 *p = fs;
90 write_unlock(&file_systems_lock);
91 return res;
92 }
unregister_filesystem() 函数是从全局链表 file_systems 中删除某个文件系统。
/// fs/filesystem.c
108 int unregister_filesystem(struct file_system_type * fs)
109 {
110 struct file_system_type ** tmp;
111
112 write_lock(&file_systems_lock);
113 tmp = &file_systems;
114 while (*tmp) {
115 if (fs == *tmp) {
116 *tmp = fs->next;
117 fs->next = NULL;
118 write_unlock(&file_systems_lock);
119 synchronize_rcu();
120 return 0;
121 }
122 tmp = &(*tmp)->next;
123 }
124 write_unlock(&file_systems_lock);
125
126 return -EINVAL;
127 }
2.2、get_fs_type()
get_fs_type() 函数根据名字 name 查找是否注册了某个文件系统,如果注册了就返回指向对应 file_system_type 的指针。
/// fs/filesystem.c
254 static struct file_system_type *__get_fs_type(const char *name, int len)
255 {
256 struct file_system_type *fs;
257
258 read_lock(&file_systems_lock);
259 fs = *(find_filesystem(name, len));
260 if (fs && !try_module_get(fs->owner))
261 fs = NULL;
262 read_unlock(&file_systems_lock);
263 return fs;
264 }
265
266 struct file_system_type *get_fs_type(const char *name)
267 {
268 struct file_system_type *fs;
269 const char *dot = strchr(name, '.');
270 int len = dot ? dot - name : strlen(name);
271
272 fs = __get_fs_type(name, len);
273 if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
274 fs = __get_fs_type(name, len);
275 if (!fs)
276 pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
277 len, name);
278 }
279
280 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
281 put_filesystem(fs);
282 fs = NULL;
283 }
284 return fs;
285 }
2.3、ext4_fs_type
ext4 在 module 初始化时调用 register_filesystem() 函数将 ext4_fs_type 注册到 Linux 系统中。ext4_fs_type 定义如下。mount 和 kill_sb 函数指针分别指向 ext4_mount() 和 kill_block_super() 两个函数。
/// fs/ext4/super.c
6681 static struct file_system_type ext4_fs_type = {
6682 .owner = THIS_MODULE,
6683 .name = "ext4",
6684 .mount = ext4_mount,
6685 .kill_sb = kill_block_super,
6686 .fs_flags = FS_REQUIRES_DEV,
6687 };
ext4_mount() 函数直接调用 mount_bdev() 函数。
/// fs/ext4/super.c
6619 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
6620 const char *dev_name, void *data)
6621 {
6622 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
6623 }
mount_bdev() 和 kill_block_super() 两个函数都不是 ext4 专有函数,都是通用函数。
3、super_block
super_block 代表某个具体的、已经挂载的文件系统,记录一个文件系统实例的信息,比如:
- 依附的物理硬件
- 索引结点 inode 和数据块 block 的位置
- block 的大小(字节)
- 文件系统类型
- 最长文件名
- 最大文件大小
- 根目录的 inode 位置
- 支持的操作
3.1、super_operations
super_operations 中定义了超级块支持的操作,是一组函数指针,指向比如 inode 分配、销毁与释放,以及将 inode 数据写回磁盘等函数。
/// include/linux/fs.h
1935 struct super_operations {
1936 struct inode *(*alloc_inode)(struct super_block *sb);
1937 void (*destroy_inode)(struct inode *);
1938 void (*free_inode)(struct inode *);
1939
1940 void (*dirty_inode) (struct inode *, int flags);
1941 int (*write_inode) (struct inode *, struct writeback_control *wbc);
1942 int (*drop_inode) (struct inode *);
1943 void (*evict_inode) (struct inode *);
1944 void (*put_super) (struct super_block *);
1945 int (*sync_fs)(struct super_block *sb, int wait);
1946 int (*freeze_super) (struct super_block *);
1947 int (*freeze_fs) (struct super_block *);
1948 int (*thaw_super) (struct super_block *);
1949 int (*unfreeze_fs) (struct super_block *);
1950 int (*statfs) (struct dentry *, struct kstatfs *);
1951 int (*remount_fs) (struct super_block *, int *, char *);
1952 void (*umount_begin) (struct super_block *);
1953
1954 int (*show_options)(struct seq_file *, struct dentry *);
1955 int (*show_devname)(struct seq_file *, struct dentry *);
1956 int (*show_path)(struct seq_file *, struct dentry *);
1957 int (*show_stats)(struct seq_file *, struct dentry *);
1958 #ifdef CONFIG_QUOTA
1959 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
1960 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
1961 struct dquot **(*get_dquots)(struct inode *);
1962 #endif
1963 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
1964 long (*nr_cached_objects)(struct super_block *,
1965 struct shrink_control *);
1966 long (*free_cached_objects)(struct super_block *,
1967 struct shrink_control *);
1968 };
3.2、super_block
/// include/linux/fs.h
1416 struct super_block {
1417 struct list_head s_list; /* Keep this first */
1418 dev_t s_dev; /* search index; _not_ kdev_t */
1419 unsigned char s_blocksize_bits;
1420 unsigned long s_blocksize; /* 块大小 */
1421 loff_t s_maxbytes; /* Max file size */
1422 struct file_system_type *s_type;
1423 const struct super_operations *s_op;
1424 const struct dquot_operations *dq_op;
1425 const struct quotactl_ops *s_qcop;
1426 const struct export_operations *s_export_op;
1427 unsigned long s_flags;
1428 unsigned long s_iflags; /* internal SB_I_* flags */
1429 unsigned long s_magic;
1430 struct dentry *s_root; /* 文件系统根目录的 dentry */
1431 struct rw_semaphore s_umount;
1432 int s_count; /* 引用计数 */
1433 atomic_t s_active;
1434 #ifdef CONFIG_SECURITY
1435 void *s_security;
1436 #endif
1437 const struct xattr_handler **s_xattr;
1438 #ifdef CONFIG_FS_ENCRYPTION
1439 const struct fscrypt_operations *s_cop;
1440 struct key *s_master_keys; /* master crypto keys in use */
1441 #endif
1442 #ifdef CONFIG_FS_VERITY
1443 const struct fsverity_operations *s_vop;
1444 #endif
1445 #ifdef CONFIG_UNICODE
1446 struct unicode_map *s_encoding;
1447 __u16 s_encoding_flags;
1448 #endif
1449 struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
1450 struct list_head s_mounts; /* list of mounts; _not_ for fs use */
1451 struct block_device *s_bdev;
1452 struct backing_dev_info *s_bdi;
1453 struct mtd_info *s_mtd;
1454 struct hlist_node s_instances;
1455 unsigned int s_quota_types; /* Bitmask of supported quota types */
1456 struct quota_info s_dquot; /* Diskquota specific options */
1457
1458 struct sb_writers s_writers;
1459
1460 /*
1461 * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
1462 * s_fsnotify_marks together for cache efficiency. They are frequently
1463 * accessed and rarely modified.
1464 */
1465 void *s_fs_info; /* Filesystem private info */
1466
1467 /* Granularity of c/m/atime in ns (cannot be worse than a second) */
1468 u32 s_time_gran;
1469 /* Time limits for c/m/atime in seconds */
1470 time64_t s_time_min;
1471 time64_t s_time_max;
1472 #ifdef CONFIG_FSNOTIFY
1473 __u32 s_fsnotify_mask;
1474 struct fsnotify_mark_connector __rcu *s_fsnotify_marks;
1475 #endif
1476
1477 char s_id[32]; /* Informational name */
1478 uuid_t s_uuid; /* UUID */
1479
1480 unsigned int s_max_links;
1481 fmode_t s_mode;
1482
1483 /*
1484 * The next field is for VFS *only*. No filesystems have any business
1485 * even looking at it. You had been warned.
1486 */
1487 struct mutex s_vfs_rename_mutex; /* Kludge */
1488
1489 /*
1490 * Filesystem subtype. If non-empty the filesystem type field
1491 * in /proc/mounts will be "type.subtype"
1492 */
1493 const char *s_subtype;
1494
1495 const struct dentry_operations *s_d_op; /* default d_op for dentries */
1496
1497 /*
1498 * Saved pool identifier for cleancache (-1 means none)
1499 */
1500 int cleancache_poolid;
1501
1502 struct shrinker s_shrink; /* per-sb shrinker handle */
1503
1504 /* Number of inodes with nlink == 0 but still referenced */
1505 atomic_long_t s_remove_count;
1506
1507 /* Pending fsnotify inode refs */
1508 atomic_long_t s_fsnotify_inode_refs;
1509
1510 /* Being remounted read-only */
1511 int s_readonly_remount;
1512
1513 /* per-sb errseq_t for reporting writeback errors via syncfs */
1514 errseq_t s_wb_err;
1515
1516 /* AIO completions deferred from interrupt context */
1517 struct workqueue_struct *s_dio_done_wq;
1518 struct hlist_head s_pins;
1519
1520 /*
1521 * Owning user namespace and default context in which to
1522 * interpret filesystem uids, gids, quotas, device nodes,
1523 * xattrs and security labels.
1524 */
1525 struct user_namespace *s_user_ns;
1526
1527 /*
1528 * The list_lru structure is essentially just a pointer to a table
1529 * of per-node lru lists, each of which has its own spinlock.
1530 * There is no need to put them into separate cachelines.
1531 */
1532 struct list_lru s_dentry_lru;
1533 struct list_lru s_inode_lru;
1534 struct rcu_head rcu;
1535 struct work_struct destroy_work;
1536
1537 struct mutex s_sync_lock; /* sync serialisation lock */
1538
1539 /*
1540 * Indicates how deep in a filesystem stack this SB is
1541 */
1542 int s_stack_depth;
1543
1544 /* s_inode_list_lock protects s_inodes */
1545 spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
1546 struct list_head s_inodes; /* all inodes */
1547
1548 spinlock_t s_inode_wblist_lock;
1549 struct list_head s_inodes_wb; /* writeback inodes */
1550 } __randomize_layout;
Linux 中已挂载的 super_block 除了通过 s_instances 链接到对应的 file_system_type::fs_supers 链表上,还会通过 s_list 链接到全局链表 super_blocks 中。
/// fs/super.c
45 static LIST_HEAD(super_blocks);
46 static DEFINE_SPINLOCK(sb_lock);
3.3、ext4_sops
/// fs/ext4/super.c
1664 static const struct super_operations ext4_sops = {
1665 .alloc_inode = ext4_alloc_inode,
1666 .free_inode = ext4_free_in_core_inode,
1667 .destroy_inode = ext4_destroy_inode,
1668 .write_inode = ext4_write_inode,
1669 .dirty_inode = ext4_dirty_inode,
1670 .drop_inode = ext4_drop_inode,
1671 .evict_inode = ext4_evict_inode,
1672 .put_super = ext4_put_super,
1673 .sync_fs = ext4_sync_fs,
1674 .freeze_fs = ext4_freeze,
1675 .unfreeze_fs = ext4_unfreeze,
1676 .statfs = ext4_statfs,
1677 .remount_fs = ext4_remount,
1678 .show_options = ext4_show_options,
1679 #ifdef CONFIG_QUOTA
1680 .quota_read = ext4_quota_read,
1681 .quota_write = ext4_quota_write,
1682 .get_dquots = ext4_get_dquots,
1683 #endif
1684 .bdev_try_to_free_page = bdev_try_to_free_page,
1685 };
4、inode
索引结点 inode 包含了内核在操作文件或目录时(目录也被当作文件看待)需要的全部信息。对于 UNIX 风格的文件系统,这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件,内核中一个 inode 代表打开的一个文件。inode 中记录的信息包括:
- 文件类型
- 文件大小
- 访问权限
- 访问或修改时间
- 文件位置(指向磁盘数据块)
4.1、inode_operations
/// include/linux/fs.h
1864 struct inode_operations {
1865 struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
1866 const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
1867 int (*permission) (struct inode *, int);
1868 struct posix_acl * (*get_acl)(struct inode *, int);
1869
1870 int (*readlink) (struct dentry *, char __user *,int);
1871
1872 int (*create) (struct inode *,struct dentry *, umode_t, bool);
1873 int (*link) (struct dentry *,struct inode *,struct dentry *);
1874 int (*unlink) (struct inode *,struct dentry *);
1875 int (*symlink) (struct inode *,struct dentry *,const char *);
1876 int (*mkdir) (struct inode *,struct dentry *,umode_t);
1877 int (*rmdir) (struct inode *,struct dentry *);
1878 int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
1879 int (*rename) (struct inode *, struct dentry *,
1880 struct inode *, struct dentry *, unsigned int);
1881 int (*setattr) (struct dentry *, struct iattr *);
1882 int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
1883 ssize_t (*listxattr) (struct dentry *, char *, size_t);
1884 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
1885 u64 len);
1886 int (*update_time)(struct inode *, struct timespec64 *, int);
1887 int (*atomic_open)(struct inode *, struct dentry *,
1888 struct file *, unsigned open_flag,
1889 umode_t create_mode);
1890 int (*tmpfile) (struct inode *, struct dentry *, umode_t);
1891 int (*set_acl)(struct inode *, struct posix_acl *, int);
1892 } ____cacheline_aligned;
4.2、inode
文件和目录都有一个 inode,但是二者的 i_op 所指向的 inode_operations 实现并不相同。
/// include/linux/fs.h
605 /*
606 * Keep mostly read-only and often accessed (especially for
607 * the RCU path lookup and 'stat' data) fields at the beginning
608 * of the 'struct inode'
609 */
610 struct inode {
611 umode_t i_mode;
612 unsigned short i_opflags;
613 kuid_t i_uid;
614 kgid_t i_gid;
615 unsigned int i_flags;
616
617 #ifdef CONFIG_FS_POSIX_ACL
618 struct posix_acl *i_acl;
619 struct posix_acl *i_default_acl;
620 #endif
621
622 const struct inode_operations *i_op;
623 struct super_block *i_sb;
624 struct address_space *i_mapping;
625
626 #ifdef CONFIG_SECURITY
627 void *i_security;
628 #endif
629
630 /* Stat data, not accessed from path walking */
631 unsigned long i_ino;
632 /*
633 * Filesystems may only read i_nlink directly. They shall use the
634 * following functions for modification:
635 *
636 * (set|clear|inc|drop)_nlink
637 * inode_(inc|dec)_link_count
638 */
639 union {
640 const unsigned int i_nlink;
641 unsigned int __i_nlink;
642 };
643 dev_t i_rdev;
644 loff_t i_size;
645 struct timespec64 i_atime;
646 struct timespec64 i_mtime;
647 struct timespec64 i_ctime;
648 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
649 unsigned short i_bytes;
650 u8 i_blkbits;
651 u8 i_write_hint;
652 blkcnt_t i_blocks;
653
654 #ifdef __NEED_I_SIZE_ORDERED
655 seqcount_t i_size_seqcount;
656 #endif
657
658 /* Misc */
659 unsigned long i_state;
660 struct rw_semaphore i_rwsem;
661
662 unsigned long dirtied_when; /* jiffies of first dirtying */
663 unsigned long dirtied_time_when;
664
665 struct hlist_node i_hash;
666 struct list_head i_io_list; /* backing dev IO list */
667 #ifdef CONFIG_CGROUP_WRITEBACK
668 struct bdi_writeback *i_wb; /* the associated cgroup wb */
669
670 /* foreign inode detection, see wbc_detach_inode() */
671 int i_wb_frn_winner;
672 u16 i_wb_frn_avg_time;
673 u16 i_wb_frn_history;
674 #endif
675 struct list_head i_lru; /* inode LRU list */
676 struct list_head i_sb_list;
677 struct list_head i_wb_list; /* backing dev writeback list */
678 union {
679 struct hlist_head i_dentry;
680 struct rcu_head i_rcu;
681 };
682 atomic64_t i_version;
683 atomic64_t i_sequence; /* see futex */
684 atomic_t i_count;
685 atomic_t i_dio_count;
686 atomic_t i_writecount;
687 #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
688 atomic_t i_readcount; /* struct files open RO */
689 #endif
690 union {
691 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
692 void (*free_inode)(struct inode *);
693 };
694 struct file_lock_context *i_flctx;
695 struct address_space i_data;
696 struct list_head i_devices;
697 union {
698 struct pipe_inode_info *i_pipe;
699 struct block_device *i_bdev;
700 struct cdev *i_cdev;
701 char *i_link;
702 unsigned i_dir_seq;
703 };
704
705 __u32 i_generation;
706
707 #ifdef CONFIG_FSNOTIFY
708 __u32 i_fsnotify_mask; /* all events this inode cares about */
709 struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
710 #endif
711
712 #ifdef CONFIG_FS_ENCRYPTION
713 struct fscrypt_info *i_crypt_info;
714 #endif
715
716 #ifdef CONFIG_FS_VERITY
717 struct fsverity_info *i_verity_info;
718 #endif
719
720 void *i_private; /* fs or device private pointer */
721 } __randomize_layout;
4.3、ext4_file_inode_operations
/// fs/ext4/file.c
930 const struct inode_operations ext4_file_inode_operations = {
931 .setattr = ext4_setattr,
932 .getattr = ext4_file_getattr,
933 .listxattr = ext4_listxattr,
934 .get_acl = ext4_get_acl,
935 .set_acl = ext4_set_acl,
936 .fiemap = ext4_fiemap,
937 };
4.4、ext4_dir_inode_operations
/// fs/ext4/namei.c
4194 /*
4195 * directories can handle most operations...
4196 */
4197 const struct inode_operations ext4_dir_inode_operations = {
4198 .create = ext4_create,
4199 .lookup = ext4_lookup,
4200 .link = ext4_link,
4201 .unlink = ext4_unlink,
4202 .symlink = ext4_symlink,
4203 .mkdir = ext4_mkdir,
4204 .rmdir = ext4_rmdir,
4205 .mknod = ext4_mknod,
4206 .tmpfile = ext4_tmpfile,
4207 .rename = ext4_rename2,
4208 .setattr = ext4_setattr,
4209 .getattr = ext4_getattr,
4210 .listxattr = ext4_listxattr,
4211 .get_acl = ext4_get_acl,
4212 .set_acl = ext4_set_acl,
4213 .fiemap = ext4_fiemap,
4214 };
4.5、ext4_special_inode_operations
/// fs/ext4/namei.c
4216 const struct inode_operations ext4_special_inode_operations = {
4217 .setattr = ext4_setattr,
4218 .getattr = ext4_getattr,
4219 .listxattr = ext4_listxattr,
4220 .get_acl = ext4_get_acl,
4221 .set_acl = ext4_set_acl,
4222 };
5、dentry
dentry 表示一个目录项(directory entry),即路径中的一个名字(d_name)及其对应的 inode(d_inode)。Linux 系统将目录也当作一个文件,其内容是该目录下的文件名或者子目录名。
5.1、dentry_operations
/// include/linux/dcache.h
135 struct dentry_operations {
136 int (*d_revalidate)(struct dentry *, unsigned int);
137 int (*d_weak_revalidate)(struct dentry *, unsigned int);
138 int (*d_hash)(const struct dentry *, struct qstr *);
139 int (*d_compare)(const struct dentry *,
140 unsigned int, const char *, const struct qstr *);
141 int (*d_delete)(const struct dentry *);
142 int (*d_init)(struct dentry *);
143 void (*d_release)(struct dentry *);
144 void (*d_prune)(struct dentry *);
145 void (*d_iput)(struct dentry *, struct inode *);
146 char *(*d_dname)(struct dentry *, char *, int);
147 struct vfsmount *(*d_automount)(struct path *);
148 int (*d_manage)(const struct path *, bool);
149 struct dentry *(*d_real)(struct dentry *, const struct inode *);
150 } ____cacheline_aligned;
5.2、dentry
/// include/linux/dcache.h
89 struct dentry {
90 /* RCU lookup touched fields */
91 unsigned int d_flags; /* protected by d_lock */
92 seqcount_spinlock_t d_seq; /* per dentry seqlock */
93 struct hlist_bl_node d_hash; /* lookup hash list */
94 struct dentry *d_parent; /* parent directory */
95 struct qstr d_name;
96 struct inode *d_inode; /* Where the name belongs to - NULL is
97 * negative */
98 unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
99
100 /* Ref lookup also touches following */
101 struct lockref d_lockref; /* per-dentry lock and refcount */
102 const struct dentry_operations *d_op;
103 struct super_block *d_sb; /* The root of the dentry tree */
104 unsigned long d_time; /* used by d_revalidate */
105 void *d_fsdata; /* fs-specific data */
106
107 union {
108 struct list_head d_lru; /* LRU list */
109 wait_queue_head_t *d_wait; /* in-lookup ones only */
110 };
111 struct list_head d_child; /* child of parent list */
112 struct list_head d_subdirs; /* our children */
113 /*
114 * d_alias and d_rcu can share memory
115 */
116 union {
117 struct hlist_node d_alias; /* inode alias list */
118 struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
119 struct rcu_head d_rcu;
120 } d_u;
121 } __randomize_layout;
5.3、ext4_dentry_ops
/// fs/ext4/dir.c
671 #ifdef CONFIG_UNICODE
672 const struct dentry_operations ext4_dentry_ops = {
673 .d_hash = generic_ci_d_hash,
674 .d_compare = generic_ci_d_compare,
675 };
676 #endif
6、file
file 从进程的角度标识一个打开的文件,主要维护如下信息:
- 文件当前的读写位置
- 打开文件的权限
- 指向 inode 的指针
6.1、file_operations
/// include/linux/fs.h
1822 struct file_operations {
1823 struct module *owner;
1824 loff_t (*llseek) (struct file *, loff_t, int);
1825 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1826 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1827 ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
1828 ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
1829 int (*iopoll)(struct kiocb *kiocb, bool spin);
1830 int (*iterate) (struct file *, struct dir_context *);
1831 int (*iterate_shared) (struct file *, struct dir_context *);
1832 __poll_t (*poll) (struct file *, struct poll_table_struct *);
1833 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1834 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1835 int (*mmap) (struct file *, struct vm_area_struct *);
1836 unsigned long mmap_supported_flags;
1837 int (*open) (struct inode *, struct file *);
1838 int (*flush) (struct file *, fl_owner_t id);
1839 int (*release) (struct inode *, struct file *);
1840 int (*fsync) (struct file *, loff_t, loff_t, int datasync);
1841 int (*fasync) (int, struct file *, int);
1842 int (*lock) (struct file *, int, struct file_lock *);
1843 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
1844 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1845 int (*check_flags)(int);
1846 int (*flock) (struct file *, int, struct file_lock *);
1847 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
1848 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1849 int (*setlease)(struct file *, long, struct file_lock **, void **);
1850 long (*fallocate)(struct file *file, int mode, loff_t offset,
1851 loff_t len);
1852 void (*show_fdinfo)(struct seq_file *m, struct file *f);
1853 #ifndef CONFIG_MMU
1854 unsigned (*mmap_capabilities)(struct file *);
1855 #endif
1856 ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
1857 loff_t, size_t, unsigned int);
1858 loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
1859 struct file *file_out, loff_t pos_out,
1860 loff_t len, unsigned int remap_flags);
1861 int (*fadvise)(struct file *, loff_t, loff_t, int);
1862 } __randomize_layout;
6.2、file
/// include/linux/fs.h
916 struct file {
917 union {
918 struct llist_node fu_llist;
919 struct rcu_head fu_rcuhead;
920 } f_u;
921 struct path f_path;
922 struct inode *f_inode; /* cached value */
923 const struct file_operations *f_op;
924
925 /*
926 * Protects f_ep_links, f_flags.
927 * Must not be taken from IRQ context.
928 */
929 spinlock_t f_lock;
930 enum rw_hint f_write_hint;
931 atomic_long_t f_count;
932 unsigned int f_flags;
933 fmode_t f_mode;
934 struct mutex f_pos_lock;
935 loff_t f_pos;
936 struct fown_struct f_owner;
937 const struct cred *f_cred;
938 struct file_ra_state f_ra;
939
940 u64 f_version;
941 #ifdef CONFIG_SECURITY
942 void *f_security;
943 #endif
944 /* needed for tty driver, and maybe others */
945 void *private_data;
946
947 #ifdef CONFIG_EPOLL
948 /* Used by fs/eventpoll.c to link all the hooks to this file */
949 struct list_head f_ep_links;
950 struct list_head f_tfile_llink;
951 #endif /* #ifdef CONFIG_EPOLL */
952 struct address_space *f_mapping;
953 errseq_t f_wb_err;
954 errseq_t f_sb_err; /* for syncfs */
955 } __randomize_layout
956 __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
6.3、ext4_file_operations
/// fs/ext4/file.c
910 const struct file_operations ext4_file_operations = {
911 .llseek = ext4_llseek,
912 .read_iter = ext4_file_read_iter,
913 .write_iter = ext4_file_write_iter,
914 .iopoll = iomap_dio_iopoll,
915 .unlocked_ioctl = ext4_ioctl,
916 #ifdef CONFIG_COMPAT
917 .compat_ioctl = ext4_compat_ioctl,
918 #endif
919 .mmap = ext4_file_mmap,
920 .mmap_supported_flags = MAP_SYNC,
921 .open = ext4_file_open,
922 .release = ext4_release_file,
923 .fsync = ext4_sync_file,
924 .get_unmapped_area = thp_get_unmapped_area,
925 .splice_read = generic_file_splice_read,
926 .splice_write = iter_file_splice_write,
927 .fallocate = ext4_fallocate,
928 };
6.4、ext4_dir_operations
目录的文件操作
/// fs/ext4/dir.c
658 const struct file_operations ext4_dir_operations = {
659 .llseek = ext4_dir_llseek,
660 .read = generic_read_dir,
661 .iterate_shared = ext4_readdir,
662 .unlocked_ioctl = ext4_ioctl,
663 #ifdef CONFIG_COMPAT
664 .compat_ioctl = ext4_compat_ioctl,
665 #endif
666 .fsync = ext4_sync_file,
667 .open = ext4_dir_open,
668 .release = ext4_release_dir,
669 };
670
671 #ifdef CONFIG_UNICODE
672 const struct dentry_operations ext4_dentry_ops = {
673 .d_hash = generic_ci_d_hash,
674 .d_compare = generic_ci_d_compare,
675 };
676 #endif
7、fs_context
fs_context 是 file_system_type 和 super_block 之间的桥梁,创建和配置 super_block 都离不开 fs_context,主要在 mount 调用时使用。
7.1、fs_context_operations
/// include/linux/fs_context.h
115 struct fs_context_operations {
116 void (*free)(struct fs_context *fc);
117 int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
118 int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
119 int (*parse_monolithic)(struct fs_context *fc, void *data);
120 int (*get_tree)(struct fs_context *fc);
121 int (*reconfigure)(struct fs_context *fc);
122 };
7.2、fs_context
/// include/linux/fs_context.h
90 struct fs_context {
91 const struct fs_context_operations *ops;
92 struct mutex uapi_mutex; /* Userspace access mutex */
93 struct file_system_type *fs_type;
94 void *fs_private; /* The filesystem's context */
95 void *sget_key;
96 struct dentry *root; /* The root and superblock */
97 struct user_namespace *user_ns; /* The user namespace for this mount */
98 struct net *net_ns; /* The network namespace for this mount */
99 const struct cred *cred; /* The mounter's credentials */
100 struct p_log log; /* Logging buffer */
101 const char *source; /* The source name (eg. dev path) */
102 void *security; /* Linux S&M options */
103 void *s_fs_info; /* Proposed s_fs_info */
104 unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
105 unsigned int sb_flags_mask; /* Superblock flags that were changed */
106 unsigned int s_iflags; /* OR'd with sb->s_iflags */
107 unsigned int lsm_flags; /* Information flags from the fs to the LSM */
108 enum fs_context_purpose purpose:8;
109 enum fs_context_phase phase:8; /* The phase the context is in */
110 bool need_free:1; /* Need to call ops->free() */
111 bool global:1; /* Goes into &init_user_ns */
112 bool oldapi:1; /* Coming from mount(2) */
113 };
8、ext4_mount
前面介绍 ext4_fs_type 一节时提到,ext4_mount() 直接调用 mount_bdev() 函数。
8.1、mount_bdev() 函数定义如下。
- 调用 blkdev_get_by_path() 函数打开一个 block 设备
- 调用 sget() 查找或者新建一个 super_block
/// fs/super.c
1365 struct dentry *mount_bdev(struct file_system_type *fs_type,
1366 int flags, const char *dev_name, void *data,
1367 int (*fill_super)(struct super_block *, void *, int))
1368 {
1369 struct block_device *bdev;
1370 struct super_block *s;
1371 fmode_t mode = FMODE_READ | FMODE_EXCL;
1372 int error = 0;
1373
1374 if (!(flags & SB_RDONLY))
1375 mode |= FMODE_WRITE;
1376
1377 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1378 if (IS_ERR(bdev))
1379 return ERR_CAST(bdev);
1380
1381 /*
1382 * once the super is inserted into the list by sget, s_umount
1383 * will protect the lockfs code from trying to start a snapshot
1384 * while we are mounting
1385 */
1386 mutex_lock(&bdev->bd_fsfreeze_mutex);
1387 if (bdev->bd_fsfreeze_count > 0) {
1388 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1389 error = -EBUSY;
1390 goto error_bdev;
1391 }
1392 s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
1393 bdev);
1394 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1395 if (IS_ERR(s))
1396 goto error_s;
1397
1398 if (s->s_root) {
1399 if ((flags ^ s->s_flags) & SB_RDONLY) {
1400 deactivate_locked_super(s);
1401 error = -EBUSY;
1402 goto error_bdev;
1403 }
1404
1405 /*
1406 * s_umount nests inside bd_mutex during
1407 * __invalidate_device(). blkdev_put() acquires
1408 * bd_mutex and can't be called under s_umount. Drop
1409 * s_umount temporarily. This is safe as we're
1410 * holding an active reference.
1411 */
1412 up_write(&s->s_umount);
1413 blkdev_put(bdev, mode);
1414 down_write(&s->s_umount);
1415 } else {
1416 s->s_mode = mode;
1417 snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
1418 sb_set_blocksize(s, block_size(bdev));
1419 error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
1420 if (error) {
1421 deactivate_locked_super(s);
1422 goto error;
1423 }
1424
1425 s->s_flags |= SB_ACTIVE;
1426 bdev->bd_super = s;
1427 }
1428
1429 return dget(s->s_root);
1430
1431 error_s:
1432 error = PTR_ERR(s);
1433 error_bdev:
1434 blkdev_put(bdev, mode);
1435 error:
1436 return ERR_PTR(error);
1437 }
传入 sget() 函数的 set_bdev_super 和 test_bdev_super 是两个函数指针,其定义如下:
/// fs/super.c
1253 static int set_bdev_super(struct super_block *s, void *data)
1254 {
1255 s->s_bdev = data;
1256 s->s_dev = s->s_bdev->bd_dev;
1257 s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
1258
1259 if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
1260 s->s_iflags |= SB_I_STABLE_WRITES;
1261 return 0;
1262 }
1360 static int test_bdev_super(struct super_block *s, void *data)
1361 {
1362 return (void *)s->s_bdev == data;
1363 }
8.2、sget()
sget() 函数用于查找或者新建一个 super_block 结构。首先在 file_system_type::fs_supers 链表上用 test 回调查找匹配的已有 super_block,找到则直接返回它,并释放本次调用中已申请但未使用的 super_block。如果没有找到,就调用 alloc_super() 函数申请一个新的 super_block 结构,用 set 回调初始化后,将其加入 super_blocks 和 fs_supers 两个链表。
/// fs/super.c
568 /**
569 * sget - find or create a superblock
570 * @type: filesystem type superblock should belong to
571 * @test: comparison callback
572 * @set: setup callback
573 * @flags: mount flags
574 * @data: argument to each of them
575 */
576 struct super_block *sget(struct file_system_type *type,
577 int (*test)(struct super_block *,void *),
578 int (*set)(struct super_block *,void *),
579 int flags,
580 void *data)
581 {
582 struct user_namespace *user_ns = current_user_ns();
583 struct super_block *s = NULL; /* spare superblock; allocated lazily on the first miss */
584 struct super_block *old;
585 int err;
586
587 /* We don't yet pass the user namespace of the parent
588 * mount through to here so always use &init_user_ns
589 * until that changes.
590 */
591 if (flags & SB_SUBMOUNT)
592 user_ns = &init_user_ns;
593
594 retry: /* re-entered after a failed grab_super() and after allocating the spare */
595 spin_lock(&sb_lock); /* held while walking type->fs_supers */
596 if (test) { /* a NULL test callback skips the lookup entirely */
597 hlist_for_each_entry(old, &type->fs_supers, s_instances) {
598 if (!test(old, data)) /* not a match for data: keep walking */
599 continue;
600 if (user_ns != old->s_user_ns) { /* match belongs to a different user namespace */
601 spin_unlock(&sb_lock);
602 destroy_unused_super(s);
603 return ERR_PTR(-EBUSY);
604 }
605 if (!grab_super(old)) /* NOTE(review): sb_lock is not released here, so grab_super() presumably drops it - confirm */
606 goto retry;
607 destroy_unused_super(s); /* discard the spare; s is still NULL if the match came on the first pass */
608 return old; /* existing superblock is reused */
609 }
610 }
611 if (!s) { /* no match and no spare yet: allocate with sb_lock dropped, then search again */
612 spin_unlock(&sb_lock);
613 s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); /* SB_SUBMOUNT is masked out before the flags are stored */
614 if (!s)
615 return ERR_PTR(-ENOMEM);
616 goto retry;
617 }
618
619 err = set(s, data); /* second pass found no match either: let the caller's callback set up the new superblock */
620 if (err) {
621 spin_unlock(&sb_lock);
622 destroy_unused_super(s);
623 return ERR_PTR(err);
624 }
625 s->s_type = type;
626 strlcpy(s->s_id, type->name, sizeof(s->s_id)); /* s_id starts out as the filesystem type name */
627 list_add_tail(&s->s_list, &super_blocks); /* link into the super_blocks list */
628 hlist_add_head(&s->s_instances, &type->fs_supers); /* link into this type's fs_supers list, where later lookups will find it */
629 spin_unlock(&sb_lock);
630 get_filesystem(type);
631 register_shrinker_prepared(&s->s_shrink);
632 return s; /* newly created superblock */
633 }
8.3、alloc_super()
/// fs/super.c
191 /**
192 * alloc_super - create new superblock
193 * @type: filesystem type superblock should belong to
194 * @flags: the mount flags
195 * @user_ns: User namespace for the super_block
196 *
197 * Allocates and initializes a new &struct super_block. alloc_super()
198 * returns a pointer new superblock or %NULL if allocation had failed.
199 */
200 static struct super_block *alloc_super(struct file_system_type *type, int flags,
201 struct user_namespace *user_ns)
202 {
203 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
204 static const struct super_operations default_op; /* no initializer, so an all-zero ops table; installed as the initial s_op below */
205 int i;
206
207 if (!s)
208 return NULL;
209
210 INIT_LIST_HEAD(&s->s_mounts);
211 s->s_user_ns = get_user_ns(user_ns); /* stores the value returned by get_user_ns() */
212 init_rwsem(&s->s_umount);
213 lockdep_set_class(&s->s_umount, &type->s_umount_key); /* lock class key comes from the filesystem type */
214 /*
215 * sget() can have s_umount recursion.
216 *
217 * When it cannot find a suitable sb, it allocates a new
218 * one (this one), and tries again to find a suitable old
219 * one.
220 *
221 * In case that succeeds, it will acquire the s_umount
222 * lock of the old one. Since these are clearly distrinct
223 * locks, and this object isn't exposed yet, there's no
224 * risk of deadlocks.
225 *
226 * Annotate this by putting this lock in a different
227 * subclass.
228 */
229 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); /* not released in this function: the superblock is returned with s_umount write-held */
230
231 if (security_sb_alloc(s)) /* a non-zero result aborts the allocation */
232 goto fail;
233
234 for (i = 0; i < SB_FREEZE_LEVELS; i++) { /* one s_writers rw_sem per freeze level */
235 if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
236 sb_writers_name[i],
237 &type->s_writers_key[i]))
238 goto fail;
239 }
240 init_waitqueue_head(&s->s_writers.wait_unfrozen);
241 s->s_bdi = &noop_backing_dev_info; /* initial value until the owner assigns a different s_bdi */
242 s->s_flags = flags;
243 if (s->s_user_ns != &init_user_ns)
244 s->s_iflags |= SB_I_NODEV; /* set for every superblock not owned by init_user_ns */
245 INIT_HLIST_NODE(&s->s_instances);
246 INIT_HLIST_BL_HEAD(&s->s_roots);
247 mutex_init(&s->s_sync_lock);
248 INIT_LIST_HEAD(&s->s_inodes);
249 spin_lock_init(&s->s_inode_list_lock);
250 INIT_LIST_HEAD(&s->s_inodes_wb);
251 spin_lock_init(&s->s_inode_wblist_lock);
252
253 s->s_count = 1; /* both counters start at 1 */
254 atomic_set(&s->s_active, 1);
255 mutex_init(&s->s_vfs_rename_mutex);
256 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
257 init_rwsem(&s->s_dquot.dqio_sem);
258 s->s_maxbytes = MAX_NON_LFS;
259 s->s_op = &default_op; /* s_op is never NULL, so callers can test individual hooks directly */
260 s->s_time_gran = 1000000000; /* NOTE(review): presumably nanoseconds, i.e. a one-second default granularity - confirm */
261 s->s_time_min = TIME64_MIN;
262 s->s_time_max = TIME64_MAX;
263 s->cleancache_poolid = CLEANCACHE_NO_POOL;
264
265 s->s_shrink.seeks = DEFAULT_SEEKS;
266 s->s_shrink.scan_objects = super_cache_scan;
267 s->s_shrink.count_objects = super_cache_count;
268 s->s_shrink.batch = 1024;
269 s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
270 if (prealloc_shrinker(&s->s_shrink)) /* only preallocated here; sget() later calls register_shrinker_prepared() */
271 goto fail;
272 if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
273 goto fail;
274 if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
275 goto fail;
276 return s;
277
278 fail: /* shared cleanup for every failure after the kzalloc() succeeded */
279 destroy_unused_super(s);
280 return NULL;
281 }
9、ext4_create
当创建一个文件时,会进入 ext4_create() 函数。
9.1、ext4_create()
创建一个新的文件,就会分配一个 inode,ext4_create() 函数的核心是创建一个 inode。
/// fs/ext4/namei.c
2626 /*
2627 * By the time this is called, we already have created
2628 * the directory cache entry for the new file, but it
2629 * is so far negative - it has no inode.
2630 *
2631 * If the create succeeds, we fill in the inode information
2632 * with d_instantiate().
2633 */
2634 static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2635 bool excl)
2636 {
2637 handle_t *handle;
2638 struct inode *inode;
2639 int err, credits, retries = 0;
2640
2641 err = dquot_initialize(dir); /* bail out before allocating anything if this fails */
2642 if (err)
2643 return err;
2644
2645 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2646 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); /* passed as the nblocks argument below */
2647 retry: /* re-entered on -ENOSPC when ext4_should_retry_alloc() allows it */
2648 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
2649 NULL, EXT4_HT_DIR, credits);
2650 handle = ext4_journal_current_handle(); /* the macro passes a NULL handle; presumably this picks up the one started inside __ext4_new_inode() - confirm */
2651 err = PTR_ERR(inode); /* provisional result; overwritten below when inode is valid */
2652 if (!IS_ERR(inode)) {
2653 inode->i_op = &ext4_file_inode_operations; /* install the ext4 file inode and file operation tables */
2654 inode->i_fop = &ext4_file_operations;
2655 ext4_set_aops(inode);
2656 err = ext4_add_nondir(handle, dentry, &inode); /* receives &inode, so it may change the local pointer */
2657 if (!err)
2658 ext4_fc_track_create(handle, dentry);
2659 }
2660 if (handle) /* handle may be NULL, so stop it only when present */
2661 ext4_journal_stop(handle);
2662 if (!IS_ERR_OR_NULL(inode)) /* inode may be an error pointer or NULL at this point */
2663 iput(inode);
2664 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2665 goto retry;
2666 return err;
2667 }
ext4_new_inode_start_handle() 定义如下
/// fs/ext4/ext4.h
2737 #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
2738 type, nblocks) \
2739 __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
2740 0, (type), __LINE__, (nblocks)) /* wrapper that passes handle = NULL and i_flags = 0, and supplies the caller's __LINE__ as line_no */
__ext4_new_inode() 定义如下
/// fs/ext4/ialloc.c
913 /*
914 * There are two policies for allocating an inode. If the new inode is
915 * a directory, then a forward search is made for a block group with both
916 * free space and a low directory-to-inode ratio; if that fails, then of
917 * the groups with above-average free space, that group with the fewest
918 * directories already is chosen.
919 *
920 * For other inodes, search forward from the parent directory's block
921 * group to find a free inode.
922 */
923 struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
924 umode_t mode, const struct qstr *qstr,
925 __u32 goal, uid_t *owner, __u32 i_flags,
926 int handle_type, unsigned int line_no,
927 int nblocks)
928 {
929 struct super_block *sb;
930 struct buffer_head *inode_bitmap_bh = NULL;
931 struct buffer_head *group_desc_bh;
932 ext4_group_t ngroups, group = 0;
933 unsigned long ino = 0;
934 struct inode *inode;
935 struct ext4_group_desc *gdp = NULL;
936 struct ext4_inode_info *ei;
937 struct ext4_sb_info *sbi;
938 int ret2, err;
939 struct inode *ret;
940 ext4_group_t i;
941 ext4_group_t flex_group;
942 struct ext4_group_info *grp = NULL;
943 bool encrypt = false;
944
945 /* Cannot create files in a deleted directory */
946 if (!dir || !dir->i_nlink)
947 return ERR_PTR(-EPERM);
948
949 sb = dir->i_sb;
950 sbi = EXT4_SB(sb);
951
952 if (unlikely(ext4_forced_shutdown(sbi)))
953 return ERR_PTR(-EIO);
954
955 ngroups = ext4_get_groups_count(sb);
956 trace_ext4_request_inode(dir, mode);
957 inode = new_inode(sb);
958 if (!inode)
959 return ERR_PTR(-ENOMEM);
960 ei = EXT4_I(inode);
// ...
9.2、alloc_inode()
new_inode() ==> new_inode_pseudo() ==> alloc_inode()
/// fs/inode.c
228 static struct inode *alloc_inode(struct super_block *sb) /* allocate an inode for sb and run inode_init_always() on it; returns NULL on failure */
229 {
230 const struct super_operations *ops = sb->s_op;
231 struct inode *inode;
232
233 if (ops->alloc_inode) /* use the filesystem's own allocator when it provides one */
234 inode = ops->alloc_inode(sb);
235 else
236 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); /* otherwise allocate from inode_cachep */
237
238 if (!inode)
239 return NULL;
240
241 if (unlikely(inode_init_always(sb, inode))) { /* initialisation failed: undo the allocation */
242 if (ops->destroy_inode) {
243 ops->destroy_inode(inode);
244 if (!ops->free_inode) /* destroy_inode without free_inode: nothing further is called here */
245 return NULL;
246 }
247 inode->free_inode = ops->free_inode;
248 i_callback(&inode->i_rcu); /* invoked directly on this path; NOTE(review): presumably frees via inode->free_inode or the cache - body not shown */
249 return NULL;
250 }
251
252 return inode;
253 }
918 /**
919 * new_inode_pseudo - obtain an inode
920 * @sb: superblock
921 *
922 * Allocates a new inode for given superblock.
923 * Inode wont be chained in superblock s_inodes list
924 * This means :
925 * - fs can't be unmount
926 * - quotas, fsnotify, writeback can't work
927 */
928 struct inode *new_inode_pseudo(struct super_block *sb)
929 {
930 struct inode *inode = alloc_inode(sb);
931
932 if (inode) {
933 spin_lock(&inode->i_lock); /* i_state is reset under i_lock */
934 inode->i_state = 0;
935 spin_unlock(&inode->i_lock);
936 INIT_LIST_HEAD(&inode->i_sb_list); /* initialised as an empty list head; not linked into any list here */
937 }
938 return inode; /* NULL when alloc_inode() failed */
939 }
940
941 /**
942 * new_inode - obtain an inode
943 * @sb: superblock
944 *
945 * Allocates a new inode for given superblock. The default gfp_mask
946 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
947 * If HIGHMEM pages are unsuitable or it is known that pages allocated
948 * for the page cache are not reclaimable or migratable,
949 * mapping_set_gfp_mask() must be called with suitable flags on the
950 * newly created inode's mapping
951 *
952 */
953 struct inode *new_inode(struct super_block *sb)
954 {
955 struct inode *inode;
956
957 spin_lock_prefetch(&sb->s_inode_list_lock); /* prefetch hint only; NOTE(review): presumably inode_sb_list_add() takes this lock - confirm */
958
959 inode = new_inode_pseudo(sb);
960 if (inode)
961 inode_sb_list_add(inode); /* the only step added on top of new_inode_pseudo() */
962 return inode; /* NULL when allocation failed */
963 }