ext4文件系统默认挂载选项

引入

Ext4的官方文档里,可以看到有很多挂载的选项,并且有一些被标记为了默认,比如delalloc。但是通过procfs/proc/mounts并没有看到这些默认的选项,比如delalloc(有个nodelalloc的disable delalloc选项,这两个是非此即彼的关系,却都没有出现)。

1
2
$ cat /proc/mounts| grep ext4
/dev/sdb / ext4 rw,relatime,discard,errors=remount-ro,data=ordered 0 0

而对于另一个文件系统相关的文件里,却能够看到这些完整的选项:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
$ cat /proc/fs/ext4/sdb/options    
rw
bsddf
nogrpid
block_validity
dioread_nolock
discard
delalloc
nowarn_on_error
journal_checksum
barrier
auto_da_alloc
user_xattr
acl
noquota
resuid=0
resgid=0
errors=remount-ro
commit=5
min_batch_time=0
max_batch_time=15000
stripe=0
data=ordered
inode_readahead_blks=32
init_itable=10
max_dir_size_kb=0

带着这个问题,基于6.1.36内核源码,梳理了文件系统如何通过procfs来展示挂载信息,并且在创建和挂载文件系统时如何处理挂载选项。

procfs的数据

/proc/{pid}/mounts

由于有了mount namespace,系统挂载点可以在各个进程之间相互隔离,不再是全局一致。所以/proc/mounts其实是指向/proc/self/mounts的符号链接。

1
2
$ ls -l /proc/mounts
lrwxrwxrwx 1 root root 11 Jul 30 05:31 /proc/mounts -> self/mounts

展示的函数是show_vfsmnt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// fs/proc_namespace.c
static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
int err;

if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt_path.dentry);
if (err)
goto out;
} else {
mangle(m, r->mnt_devname ? r->mnt_devname : "none");
}
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
if (err)
goto out;
seq_putc(m, ' ');
show_type(m, sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
err = show_sb_opts(m, sb);
if (err)
goto out;
show_mnt_opts(m, mnt);
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
out:
return err;
}

其中主要有三个函数输出了options,show_sb_opts()show_mnt_opts(),还有sb->s_op->show_options,其中show_sb_opts()show_mnt_opts打印的是vfs层面通用的一些选项,比较少。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// fs/proc_namespace.c
static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_opts fs_opts[] = {
{ SB_SYNCHRONOUS, ",sync" },
{ SB_DIRSYNC, ",dirsync" },
{ SB_MANDLOCK, ",mand" },
{ SB_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_opts *fs_infop;

for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) {
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}

return security_sb_show_options(m, sb);
}

static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
{
static const struct proc_fs_opts mnt_opts[] = {
{ MNT_NOSUID, ",nosuid" },
{ MNT_NODEV, ",nodev" },
{ MNT_NOEXEC, ",noexec" },
{ MNT_NOATIME, ",noatime" },
{ MNT_NODIRATIME, ",nodiratime" },
{ MNT_RELATIME, ",relatime" },
{ MNT_NOSYMFOLLOW, ",nosymfollow" },
{ 0, NULL }
};
const struct proc_fs_opts *fs_infop;

for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) {
if (mnt->mnt_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}

if (mnt_user_ns(mnt) != &init_user_ns)
seq_puts(m, ",idmapped");
}

sb->s_op->show_options,对于ext4文件系统来说,是_ext4_show_options

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// fs/ext4/super.c
static const struct super_operations ext4_sops = {
// ...
.show_options = ext4_show_options,
// ...
};

static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
return _ext4_show_options(seq, root->d_sb, 0);
}

/*
* Show an option if
* - it's set to a non-default value OR
* - if the per-sb default is different from the global default
*/
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
int nodefs)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int def_errors, def_mount_opt = sbi->s_def_mount_opt;
const struct mount_opts *m;
char sep = nodefs ? '\n' : ',';

#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

if (sbi->s_sb_block != 1)
SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
(m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
if ((want_set &&
(sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
(!want_set && (sbi->s_mount_opt & m->mount_opt)))
continue; /* select Opt_noFoo vs Opt_Foo */
SEQ_OPTS_PRINT("%s", token2str(m->token));
}

if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
SEQ_OPTS_PRINT("resuid=%u",
from_kuid_munged(&init_user_ns, sbi->s_resuid));
if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
SEQ_OPTS_PRINT("resgid=%u",
from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
SEQ_OPTS_PUTS("errors=remount-ro");
if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
SEQ_OPTS_PUTS("errors=continue");
if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
SEQ_OPTS_PUTS("errors=panic");
if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
if (sb->s_flags & SB_I_VERSION)
SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ def_mount_opt)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
SEQ_OPTS_PUTS("data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
SEQ_OPTS_PUTS("data=ordered");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
SEQ_OPTS_PUTS("data=writeback");
}
if (nodefs ||
sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
SEQ_OPTS_PRINT("inode_readahead_blks=%u",
sbi->s_inode_readahead_blks);

if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");

fscrypt_show_test_dummy_encryption(seq, sep, sb);

if (sb->s_flags & SB_INLINECRYPT)
SEQ_OPTS_PUTS("inlinecrypt");

if (test_opt(sb, DAX_ALWAYS)) {
if (IS_EXT2_SB(sb))
SEQ_OPTS_PUTS("dax");
else
SEQ_OPTS_PUTS("dax=always");
} else if (test_opt2(sb, DAX_NEVER)) {
SEQ_OPTS_PUTS("dax=never");
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
ext4_show_quota_options(seq, sb);
return 0;
}

/proc/fs/ext4/{device}/options

在ext4文件系统被挂载的时候,会在__ext4_fill_super()里调用ext4_register_sysfs()来注册procfs的条目:

1
2
3
4
5
6
7
8
9
10
11
// fs/ext4/sysfs.c
int ext4_register_sysfs(struct super_block *sb)
{
// ...
if (sbi->s_proc) {
proc_create_single_data("options", S_IRUGO, sbi->s_proc,
ext4_seq_options_show, sb);
// ...
}
return 0;
}

展示数据的函数为ext4_seq_options_show(),最终也是调用了_ext4_show_options(),不过最后一个参数nodefs为1,导致和/proc/{pid}/mounts的输出不一致。

1
2
3
4
5
6
7
8
9
10
11
// fs/ext4/super.c
int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
int rc;

seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
rc = _ext4_show_options(seq, sb, 1);
seq_puts(seq, "\n");
return rc;
}

其中,导致差异的点有两个,一个是导致sep为’,’还是’\n’;另一个是nodefs为0的情况下,会省略一些选项的输出。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
int nodefs)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int def_errors, def_mount_opt = sbi->s_def_mount_opt;
const struct mount_opts *m;
char sep = nodefs ? '\n' : ',';

if (sbi->s_sb_block != 1)
SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
if ((want_set &&
(sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
(!want_set && (sbi->s_mount_opt & m->mount_opt)))
continue; /* select Opt_noFoo vs Opt_Foo */
SEQ_OPTS_PRINT("%s", token2str(m->token));
}

if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
SEQ_OPTS_PRINT("resuid=%u",
from_kuid_munged(&init_user_ns, sbi->s_resuid));
if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
SEQ_OPTS_PRINT("resgid=%u",
from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
SEQ_OPTS_PUTS("errors=remount-ro");
// ...
if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ def_mount_opt)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
SEQ_OPTS_PUTS("data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
SEQ_OPTS_PUTS("data=ordered");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
SEQ_OPTS_PUTS("data=writeback");
}
if (nodefs ||
sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
SEQ_OPTS_PRINT("inode_readahead_blks=%u",
sbi->s_inode_readahead_blks);

if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
// ...
}

看来起来主要就是nodefs为1和0的两种情况导致输出不一致。可以看到这段逻辑,如果是default的options,并且nodefs为0,就跳过。

1
2
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */

这里default_options应该就是文档里描述的那些。我先入为主的以为这个就是磁盘上super_block的s_default_opts字段,于是通过tune_2fs查看了下,发现并不是。

1
2
3
4
5
$ sudo tune2fs -l /dev/sdb
tune2fs 1.46.5 (30-Dec-2021)
// ...
Default mount options: user_xattr acl
// ...

这里只有user_xattr和acl,理论上其他的那些options当nodefs为0时也会输出,比如delalloc。但是却并没有。所以,还得探究下这个sbi->s_def_mount_opt字段究竟是如何被设置的。

默认挂载选项

熟悉linux文件系统的都知道,vfs会有个通用的super block,每个文件系统也会有自己的super block,它们在磁盘和在内存上都会有些许差距。对于ext4来说,它在磁盘上的super block布局为

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// fs/ext4/ext4.h
/*
* Structure of the super block
*/
struct ext4_super_block {
/*00*/ __le32 s_inodes_count; /* Inodes count */
__le32 s_blocks_count_lo; /* Blocks count */
__le32 s_r_blocks_count_lo; /* Reserved blocks count */
__le32 s_free_blocks_count_lo; /* Free blocks count */
/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
__le32 s_first_data_block; /* First Data Block */
__le32 s_log_block_size; /* Block size */
__le32 s_log_cluster_size; /* Allocation cluster size */
/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
__le32 s_clusters_per_group; /* # Clusters per group */
__le32 s_inodes_per_group; /* # Inodes per group */
__le32 s_mtime; /* Mount time */
/*30*/ __le32 s_wtime; /* Write time */
__le16 s_mnt_count; /* Mount count */
__le16 s_max_mnt_count; /* Maximal mount count */
__le16 s_magic; /* Magic signature */
__le16 s_state; /* File system state */
__le16 s_errors; /* Behaviour when detecting errors */
__le16 s_minor_rev_level; /* minor revision level */
/*40*/ __le32 s_lastcheck; /* time of last check */
__le32 s_checkinterval; /* max. time between checks */
__le32 s_creator_os; /* OS */
__le32 s_rev_level; /* Revision level */
/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
__le16 s_def_resgid; /* Default gid for reserved blocks */
/*
* These fields are for EXT4_DYNAMIC_REV superblocks only.
*
* Note: the difference between the compatible feature set and
* the incompatible feature set is that if there is a bit set
* in the incompatible feature set that the kernel doesn't
* know about, it should refuse to mount the filesystem.
*
* e2fsck's requirements are more strict; if it doesn't know
* about a feature in either the compatible or incompatible
* feature set, it must abort and not try to meddle with
* things it doesn't understand...
*/
__le32 s_first_ino; /* First non-reserved inode */
__le16 s_inode_size; /* size of inode structure */
__le16 s_block_group_nr; /* block group # of this superblock */
__le32 s_feature_compat; /* compatible feature set */
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
* Performance hints. Directory preallocation should only
* happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
*/
__u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
__u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
__le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
/*
* Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
*/
/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
__le32 s_journal_dev; /* device number of journal file */
__le32 s_last_orphan; /* start of list of inodes to delete */
__le32 s_hash_seed[4]; /* HTREE hash seed */
__u8 s_def_hash_version; /* Default hash version to use */
__u8 s_jnl_backup_type;
__le16 s_desc_size; /* size of group descriptor */
/*100*/ __le32 s_default_mount_opts;
__le32 s_first_meta_bg; /* First metablock block group */
__le32 s_mkfs_time; /* When the filesystem was created */
__le32 s_jnl_blocks[17]; /* Backup of the journal inode */
/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
__le32 s_r_blocks_count_hi; /* Reserved blocks count */
__le32 s_free_blocks_count_hi; /* Free blocks count */
__le16 s_min_extra_isize; /* All inodes have at least # bytes */
__le16 s_want_extra_isize; /* New inodes should reserve # bytes */
__le32 s_flags; /* Miscellaneous flags */
__le16 s_raid_stride; /* RAID stride */
__le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_checksum_type; /* metadata checksum algorithm used */
__u8 s_encryption_level; /* versioning level for encryption */
__u8 s_reserved_pad; /* Padding to next 32bits */
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
__le32 s_snapshot_inum; /* Inode number of active snapshot */
__le32 s_snapshot_id; /* sequential ID of active snapshot */
__le64 s_snapshot_r_blocks_count; /* reserved blocks for active
snapshot's future use */
__le32 s_snapshot_list; /* inode number of the head of the
on-disk snapshot list */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
__le32 s_error_count; /* number of fs errors */
__le32 s_first_error_time; /* first time an error happened */
__le32 s_first_error_ino; /* inode involved in first error */
__le64 s_first_error_block; /* block involved of first error */
__u8 s_first_error_func[32] __nonstring; /* function where the error happened */
__le32 s_first_error_line; /* line number where error happened */
__le32 s_last_error_time; /* most recent time of an error */
__le32 s_last_error_ino; /* inode involved in last error */
__le32 s_last_error_line; /* line number where error happened */
__le64 s_last_error_block; /* block involved of last error */
__u8 s_last_error_func[32] __nonstring; /* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
__u8 s_mount_opts[64];
__le32 s_usr_quota_inum; /* inode for tracking user quota */
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
__u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
__u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
__le32 s_lpf_ino; /* Location of the lost+found inode */
__le32 s_prj_quota_inum; /* inode for tracking project quota */
__le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
__u8 s_wtime_hi;
__u8 s_mtime_hi;
__u8 s_mkfs_time_hi;
__u8 s_lastcheck_hi;
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
__u8 s_first_error_errcode;
__u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
__le32 s_reserved[94]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};

它会有个字段s_default_mount_opts,其实就是tune2fs工具展示的Default mount options,这个值是在磁盘上永久保存的,一般都是当mkfs创建文件系统的时候写入,也可以通过tune2fs工具来修改。

它允许的默认值包括如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// fs/ext4/ext4.h
/*
* Default mount options
*/
#define EXT4_DEFM_DEBUG 0x0001
#define EXT4_DEFM_BSDGROUPS 0x0002
#define EXT4_DEFM_XATTR_USER 0x0004
#define EXT4_DEFM_ACL 0x0008
#define EXT4_DEFM_UID16 0x0010
#define EXT4_DEFM_JMODE 0x0060
#define EXT4_DEFM_JMODE_DATA 0x0020
#define EXT4_DEFM_JMODE_ORDERED 0x0040
#define EXT4_DEFM_JMODE_WBACK 0x0060
#define EXT4_DEFM_NOBARRIER 0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD 0x0400
#define EXT4_DEFM_NODELALLOC 0x0800

mkfs可以通过配置文件来设置创建文件系统后super block里default_mntopts字段的值:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// /etc/mke2fs.conf
[defaults]
base_features = sparse_super,large_file,filetype,resize_inode,dir_index,ext_attr
default_mntopts = acl,user_xattr
enable_periodic_fsck = 0
blocksize = 4096
inode_size = 256
inode_ratio = 16384

[fs_types]
ext3 = {
features = has_journal
}
ext4 = {
features = has_journal,extent,huge_file,flex_bg,metadata_csum,64bit,dir_nlink,extra_isize
}
small = {
inode_ratio = 4096
}
floppy = {
inode_ratio = 8192
}
big = {
inode_ratio = 32768
}
huge = {
inode_ratio = 65536
}
news = {
inode_ratio = 4096
}
largefile = {
inode_ratio = 1048576
blocksize = -1
}
largefile4 = {
inode_ratio = 4194304
blocksize = -1
}
hurd = {
blocksize = 4096
inode_size = 128
warn_y2038_dates = 0
}

可以确定,内存里加载过后的superblock的字段s_def_mount_opt和磁盘上super block的字段s_default_mount_opts实际上并非对应的关系,在内核挂载阶段的内核代码里会对ext4_sb_info -> s_def_mount_opt进行设置。有意思的是,可以在磁盘上的superblock里设置EXT4_DEFM_NODELALLOC,从而改变挂载时默认delalloc的逻辑。

接下来梳理了一下内核代码关于mount option的设置流程:sys_mount() -> do_mount() -> path_mount() -> do_new_mount() -> vfs_get_tree() -> ext4_get_tree() -> get_tree_bdev() -> ext4_fill_super -> __ext4_fill_super()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// fs/ext4/super.c
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct flex_groups **flex_groups;
ext4_fsblk_t block;
struct inode *root;
struct ext4_fs_context *ctx = fc->fs_private;

// 加载sbi->s_es,es指向磁盘上布局的super block数据
err = ext4_load_super(sb, &logical_sb_block, silent);

es = sbi->s_es;

// 解析es->s_default_mount_opts,这也是mkfs时可以设置的挂载options
// 这个字段为字符串
ext4_set_def_opts(sb, es);

// 解析es->s_mount_opts
err = parse_apply_sb_mount_options(sb, ctx);
if (err < 0)
goto failed_mount;

// 这里的赋值很关键,会把上述解析出来的option都列为s_def_mount_opt
sbi->s_def_mount_opt = sbi->s_mount_opt;

// 这里在设置挂载时参数附带的options,不再将其设置为s_def_mount_opt
ext4_apply_options(fc, sb);

// 这里也有可能设置sbi->s_def_mount_opt
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
}
}

// 通过es->s_default_mount_opts来设置sbi->s_mount_opt
static void ext4_set_def_opts(struct super_block *sb,
struct ext4_super_block *es)
{
unsigned long def_mount_opts;

/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
set_opt(sb, INIT_INODE_TABLE);
if (def_mount_opts & EXT4_DEFM_DEBUG)
set_opt(sb, DEBUG);
if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
set_opt(sb, GRPID);
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
set_opt(sb, XATTR_USER);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
if (ext4_has_feature_fast_commit(sb))
set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
if (ext4_has_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);

if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
set_opt(sb, ORDERED_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);

if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
/* block_validity enabled by default; disable with noblock_validity */
set_opt(sb, BLOCK_VALIDITY);
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);

if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);

/*
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);

if (sb->s_blocksize == PAGE_SIZE)
set_opt(sb, DIOREAD_NOLOCK);
}

// 如果之前没有设置过EXT4_MOUNT_JOURNAL_DATA字段,也会将其设置成默认字段
static int ext4_load_and_init_journal(struct super_block *sb,
struct ext4_super_block *es,
struct ext4_fs_context *ctx)
{
// ...

/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
case 0:
/* No mode set, assume a default based on the journal
* capabilities: ORDERED_DATA if the journal can
* cope, else JOURNAL_DATA
*/
if (jbd2_journal_check_available_features
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
set_opt(sb, ORDERED_DATA);
sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
} else {
set_opt(sb, JOURNAL_DATA);
sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
}
break;
case EXT4_MOUNT_ORDERED_DATA:
case EXT4_MOUNT_WRITEBACK_DATA:
// ...
}
// ...
}

可以看到,在挂载的过程中,sbi->s_def_mount_optes->s_mount_opts并非简单的对应关系,ext4会对sbi->s_def_mount_opt进行额外的设置。

总结

Ext4文件系统的mount option设置来源有几个地方,一是通过mkfs时在磁盘上的superblock写入,二是挂载时的参数。但是这些和内存中文件系统相关的mount options和default mount options都不是简单的对应关系,ext4会根据其他的信息设置options。

查看ext4挂载选项认准/proc/fs/ext4/{device}/options